visual_parser.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336
  1. from core.router import Parser
  2. from models.result import ParseResult
  3. from utils.logger import log
  4. import os
  5. import time
  6. import httpx
  7. import json
  8. import io
  9. import zipfile
  10. from pathlib import Path
  11. import asyncio
  12. # 延迟导入PaddleOCR,避免模块级初始化
  13. class VisualDocParser(Parser):
  14. """视觉文档解析器,处理图片和扫描件PDF"""
  15. def __init__(self):
  16. # MinerU API配置 - 使用本地部署的服务
  17. self.mineru_api_key = ""
  18. self.base_url = "http://10.192.72.13:7284"
  19. self.model_version = "hybrid-auto-engine"
  20. self.poll_interval_sec = 3.0
  21. self.max_wait_sec = 300.0
  22. log.info("VisualDocParser初始化完成,使用本地部署的MinerU服务")
  23. async def parse(self, file_path: str) -> ParseResult:
  24. """
  25. 解析视觉文档
  26. Args:
  27. file_path: 文件路径
  28. Returns:
  29. ParseResult: 解析结果
  30. """
  31. log.info(f"开始解析视觉文档: {file_path}")
  32. try:
  33. # 只使用MinerU API,避免PaddleOCR的初始化问题
  34. result = await self._try_mineru(file_path)
  35. if result:
  36. return result
  37. # MinerU失败时,返回错误信息
  38. return ParseResult(
  39. content="",
  40. metadata={"error": "MinerU API解析失败"},
  41. file_type="visual"
  42. )
  43. except Exception as e:
  44. log.error(f"视觉文档解析失败: {str(e)}")
  45. return ParseResult(
  46. content="",
  47. metadata={"error": str(e)},
  48. file_type="visual"
  49. )
  50. async def _try_mineru(self, file_path: str) -> ParseResult:
  51. """
  52. 尝试使用本地MinerU API解析
  53. Args:
  54. file_path: 文件路径
  55. Returns:
  56. ParseResult: 解析结果,如果失败返回None
  57. """
  58. try:
  59. log.info("开始使用本地MinerU API解析文件")
  60. file_path_obj = Path(file_path)
  61. if not file_path_obj.exists():
  62. raise FileNotFoundError(str(file_path))
  63. log.info(f"Calling local MinerU for file: {file_path}")
  64. # 直接使用本地API上传文件并获取结果
  65. result = await self._upload_and_parse(file_path_obj)
  66. # 提取文本内容
  67. text_content = self._extract_text_from_local_result(result)
  68. return ParseResult(
  69. content=text_content,
  70. metadata={
  71. "parser": "Local MinerU API",
  72. "file_size": file_path_obj.stat().st_size,
  73. "backend": self.model_version
  74. },
  75. file_type="visual"
  76. )
  77. except Exception as e:
  78. log.warning(f"本地MinerU API解析失败: {str(e)}")
  79. return None
  80. async def _upload_and_parse(self, file_path: Path) -> dict:
  81. """
  82. 上传文件并解析
  83. Args:
  84. file_path: 文件路径
  85. Returns:
  86. dict: 解析结果
  87. """
  88. url = f"{self.base_url}/file_parse"
  89. # 准备表单数据
  90. files = {
  91. "files": (file_path.name, open(file_path, 'rb'))
  92. }
  93. # 准备参数
  94. data = {
  95. "backend": self.model_version,
  96. "lang_list": ["ch"],
  97. "return_md": True,
  98. "formula_enable": True,
  99. "table_enable": True
  100. }
  101. async with httpx.AsyncClient(timeout=300) as client:
  102. resp = await client.post(url, files=files, data=data)
  103. resp.raise_for_status()
  104. result = resp.json()
  105. return result
  106. def _extract_text_from_local_result(self, result: dict) -> str:
  107. """
  108. 从本地MinerU返回的结果中提取文本内容
  109. Args:
  110. result: 本地MinerU返回的结果
  111. Returns:
  112. str: 提取的文本内容
  113. """
  114. # 处理不同可能的返回结构
  115. text_parts = []
  116. if isinstance(result, dict):
  117. # 检查是否有results字段(新的返回结构)
  118. if "results" in result:
  119. results = result["results"]
  120. if isinstance(results, dict):
  121. for key, value in results.items():
  122. if isinstance(value, dict):
  123. # 检查是否有md_content字段
  124. if "md_content" in value:
  125. text_parts.append(str(value["md_content"]))
  126. # 检查是否有text字段
  127. elif "text" in value:
  128. text_parts.append(str(value["text"]))
  129. # 检查是否有markdown内容
  130. elif "markdown" in result:
  131. text_parts.append(str(result["markdown"]))
  132. # 检查是否有text字段
  133. elif "text" in result:
  134. text_parts.append(str(result["text"]))
  135. # 检查是否有content字段
  136. elif "content" in result:
  137. if isinstance(result["content"], str):
  138. text_parts.append(result["content"])
  139. elif isinstance(result["content"], list):
  140. for item in result["content"]:
  141. if isinstance(item, dict) and "text" in item:
  142. text_parts.append(str(item["text"]))
  143. elif isinstance(item, str):
  144. text_parts.append(item)
  145. return "\n\n".join(text_parts)
  146. def _safe_stem(self, stem: str) -> str:
  147. """
  148. 创建安全的缓存键
  149. Args:
  150. stem: 文件stem
  151. Returns:
  152. str: 安全的缓存键
  153. """
  154. import re
  155. return re.sub(r'[^a-zA-Z0-9_-]', '_', stem)
  156. def _extract_text_from_payload(self, payload: dict) -> str:
  157. """
  158. 从MinerU返回的payload中提取文本内容
  159. Args:
  160. payload: MinerU返回的payload
  161. Returns:
  162. str: 提取的文本内容
  163. """
  164. # 根据MinerU API返回的结构提取文本
  165. text_parts = []
  166. # 处理不同可能的返回结构
  167. if isinstance(payload, dict):
  168. # 检查是否有text字段
  169. if "text" in payload:
  170. text_parts.append(str(payload["text"]))
  171. # 检查是否有content字段
  172. elif "content" in payload:
  173. if isinstance(payload["content"], str):
  174. text_parts.append(payload["content"])
  175. elif isinstance(payload["content"], list):
  176. for item in payload["content"]:
  177. if isinstance(item, dict) and "text" in item:
  178. text_parts.append(str(item["text"]))
  179. elif isinstance(item, str):
  180. text_parts.append(item)
  181. # 检查是否有pages字段
  182. elif "pages" in payload:
  183. for page_num, page_content in enumerate(payload["pages"], 1):
  184. text_parts.append(f"# 第{page_num}页")
  185. if isinstance(page_content, str):
  186. text_parts.append(page_content)
  187. elif isinstance(page_content, dict) and "text" in page_content:
  188. text_parts.append(str(page_content["text"]))
  189. elif isinstance(payload, list):
  190. for item in payload:
  191. if isinstance(item, dict) and "text" in item:
  192. text_parts.append(str(item["text"]))
  193. elif isinstance(item, str):
  194. text_parts.append(item)
  195. return "\n\n".join(text_parts)
  196. async def _use_paddleocr(self, file_path: str) -> ParseResult:
  197. """
  198. 使用PaddleOCR解析
  199. Args:
  200. file_path: 文件路径
  201. Returns:
  202. ParseResult: 解析结果
  203. """
  204. log.info("使用PaddleOCR解析视觉文档")
  205. # 检查PaddleOCR是否初始化成功
  206. if self.ocr is None:
  207. log.error("PaddleOCR未初始化,无法解析")
  208. return ParseResult(
  209. content="",
  210. metadata={"error": "PaddleOCR未初始化"},
  211. file_type="visual"
  212. )
  213. # 对于PDF文件,需要先转换为图片
  214. if file_path.endswith('.pdf'):
  215. return await self._ocr_pdf(file_path)
  216. else:
  217. return await self._ocr_image(file_path)
  218. async def _ocr_pdf(self, file_path: str) -> ParseResult:
  219. """
  220. OCR处理PDF文件
  221. Args:
  222. file_path: 文件路径
  223. Returns:
  224. ParseResult: 解析结果
  225. """
  226. import fitz # PyMuPDF
  227. doc = fitz.open(file_path)
  228. content = []
  229. page_count = len(doc)
  230. # 遍历所有页面
  231. for page_num in range(page_count):
  232. page = doc[page_num]
  233. # 将页面转换为图片
  234. pix = page.get_pixmap(dpi=300)
  235. img_path = f"temp_page_{page_num}.png"
  236. pix.save(img_path)
  237. # OCR处理图片
  238. ocr_result = self.ocr.ocr(img_path, cls=True)
  239. page_text = []
  240. for line in ocr_result:
  241. for word_info in line:
  242. page_text.append(word_info[1][0])
  243. content.append(f"# 第{page_num + 1}页\n{' '.join(page_text)}")
  244. # 删除临时图片
  245. if os.path.exists(img_path):
  246. os.remove(img_path)
  247. doc.close()
  248. return ParseResult(
  249. content="\n\n".join(content),
  250. metadata={
  251. "parser": "PaddleOCR",
  252. "page_count": page_count,
  253. "file_size": os.path.getsize(file_path)
  254. },
  255. file_type="pdf_scanned"
  256. )
  257. async def _ocr_image(self, file_path: str) -> ParseResult:
  258. """
  259. OCR处理图片文件
  260. Args:
  261. file_path: 文件路径
  262. Returns:
  263. ParseResult: 解析结果
  264. """
  265. # 使用PaddleOCR识别图片
  266. ocr_result = self.ocr.ocr(file_path, cls=True)
  267. content = []
  268. for line in ocr_result:
  269. for word_info in line:
  270. content.append(word_info[1][0])
  271. return ParseResult(
  272. content=' '.join(content),
  273. metadata={
  274. "parser": "PaddleOCR",
  275. "file_size": os.path.getsize(file_path)
  276. },
  277. file_type="image"
  278. )