"""Parser routing (router.py).

Defines the abstract ``Parser`` base class and ``ParserFactory``, which
selects a parser from a file's detected MIME type (with an extra
text-density check for PDFs) and keeps aggregate parsing statistics.
"""

import importlib
import os
import time
from abc import ABC, abstractmethod
from typing import Optional, Type

import fitz  # PyMuPDF

from models.result import ParseResult
from utils.logger import log
from utils.mime_detector import MimeDetector


  7. class Parser(ABC):
  8. """解析器基类"""
  9. @abstractmethod
  10. async def parse(self, file_path: str) -> ParseResult:
  11. """
  12. 解析文件
  13. Args:
  14. file_path: 文件路径
  15. Returns:
  16. ParseResult: 解析结果
  17. """
  18. pass
  19. def _get_file_size(self, file_path: str) -> int:
  20. """
  21. 获取文件大小
  22. Args:
  23. file_path: 文件路径
  24. Returns:
  25. int: 文件大小(字节)
  26. """
  27. import os
  28. try:
  29. return os.path.getsize(file_path)
  30. except Exception:
  31. return 0
  32. class ParserFactory:
  33. """解析器工厂类"""
  34. def __init__(self):
  35. self.mime_detector = MimeDetector()
  36. self.parsers = {}
  37. self.parser_instances = {} # 缓存解析器实例
  38. # 统计信息
  39. self.stats = {
  40. 'total_files': 0,
  41. 'total_size': 0,
  42. 'text_files': 0,
  43. 'text_size': 0,
  44. 'image_files': 0,
  45. 'image_size': 0,
  46. 'audio_files': 0,
  47. 'audio_size': 0,
  48. 'video_files': 0,
  49. 'video_size': 0,
  50. 'pdf_files': 0,
  51. 'pdf_size': 0,
  52. 'office_files': 0,
  53. 'office_size': 0,
  54. 'total_time': 0,
  55. 'successful_files': 0,
  56. 'failed_files': 0
  57. }
  58. def register_parser(self, mime_type: str, parser_class: Type[Parser]):
  59. """
  60. 注册解析器
  61. Args:
  62. mime_type: MIME类型
  63. parser_class: 解析器类
  64. """
  65. self.parsers[mime_type] = parser_class
  66. async def get_parser(self, file_path: str) -> Parser:
  67. """
  68. 根据文件类型和内容特征获取合适的解析器
  69. Args:
  70. file_path: 文件路径
  71. Returns:
  72. Parser: 解析器实例
  73. """
  74. log.info(f"开始获取解析器,文件路径: {file_path}")
  75. # 1. 检测文件MIME类型
  76. mime_type = self.mime_detector.detect(file_path)
  77. log.info(f"文件MIME类型: {mime_type}")
  78. # 2. 第一层路由:根据MIME类型分流
  79. if mime_type.startswith("text/"):
  80. log.info("检测到文本文件,使用TextParser")
  81. if "TextParser" not in self.parser_instances:
  82. from parsers.text_parser import TextParser
  83. self.parser_instances["TextParser"] = TextParser()
  84. return self.parser_instances["TextParser"]
  85. elif mime_type.startswith("image/"):
  86. log.info("检测到图片文件,使用VisualDocParser")
  87. if "VisualDocParser" not in self.parser_instances:
  88. from parsers.visual_parser import VisualDocParser
  89. self.parser_instances["VisualDocParser"] = VisualDocParser()
  90. return self.parser_instances["VisualDocParser"]
  91. elif mime_type.startswith("audio/"):
  92. log.info("检测到音频文件,使用AudioParser")
  93. if "AudioParser" not in self.parser_instances:
  94. from parsers.audio_parser import AudioParser
  95. self.parser_instances["AudioParser"] = AudioParser()
  96. return self.parser_instances["AudioParser"]
  97. elif mime_type.startswith("video/"):
  98. log.info("检测到视频文件,使用VideoParser")
  99. if "VideoParser" not in self.parser_instances:
  100. from parsers.video_parser import VideoParser
  101. self.parser_instances["VideoParser"] = VideoParser()
  102. return self.parser_instances["VideoParser"]
  103. elif mime_type == "application/pdf":
  104. # 3. 第二层路由:PDF特殊处理
  105. log.info("检测到PDF文件,进入特殊路由")
  106. return await self._route_pdf(file_path)
  107. elif "openxmlformats" in mime_type or mime_type == "application/msword":
  108. # Office文件处理(包括docx和doc)
  109. log.info(f"检测到Office文件,MIME类型: {mime_type},使用NativeDocParser")
  110. return await self._route_office(file_path, mime_type)
  111. else:
  112. log.error(f"不支持的文件类型: {mime_type}")
  113. raise Exception(f"不支持的文件类型: {mime_type}")
  114. async def _route_pdf(self, file_path: str) -> Parser:
  115. """
  116. PDF文件路由逻辑
  117. Args:
  118. file_path: 文件路径
  119. Returns:
  120. Parser: 解析器实例
  121. """
  122. # 检测PDF是否为扫描件(文本密度检测)
  123. if self._is_scanned_pdf(file_path):
  124. log.info("PDF为扫描件,使用VisualDocParser")
  125. if "VisualDocParser" not in self.parser_instances:
  126. from parsers.visual_parser import VisualDocParser
  127. self.parser_instances["VisualDocParser"] = VisualDocParser()
  128. return self.parser_instances["VisualDocParser"]
  129. else:
  130. log.info("PDF为原生文档,使用NativeDocParser")
  131. if "NativeDocParser" not in self.parser_instances:
  132. from parsers.native_parser import NativeDocParser
  133. self.parser_instances["NativeDocParser"] = NativeDocParser()
  134. return self.parser_instances["NativeDocParser"]
  135. async def _route_office(self, file_path: str, mime_type: str) -> Parser:
  136. """
  137. Office文件路由逻辑
  138. Args:
  139. file_path: 文件路径
  140. mime_type: MIME类型
  141. Returns:
  142. Parser: 解析器实例
  143. """
  144. if "NativeDocParser" not in self.parser_instances:
  145. from parsers.native_parser import NativeDocParser
  146. self.parser_instances["NativeDocParser"] = NativeDocParser()
  147. return self.parser_instances["NativeDocParser"]
  148. def _is_scanned_pdf(self, file_path: str) -> bool:
  149. """
  150. 检测PDF是否为扫描件
  151. Args:
  152. file_path: 文件路径
  153. Returns:
  154. bool: 是否为扫描件
  155. """
  156. try:
  157. doc = fitz.open(file_path)
  158. text_content = ""
  159. # 提取前3页文本
  160. for page_num in range(min(3, len(doc))):
  161. page = doc[page_num]
  162. text_content += page.get_text()
  163. doc.close()
  164. # 计算有效字符数
  165. valid_chars = len([c for c in text_content if c.isalnum() or c.isspace()])
  166. log.info(f"PDF前3页有效字符数: {valid_chars}")
  167. # 如果有效字符数少于50,认为是扫描件
  168. return valid_chars < 50
  169. except Exception as e:
  170. log.error(f"PDF文本提取失败: {str(e)}")
  171. # 提取失败时默认使用VisualDocParser
  172. return True
  173. async def parse(self, file_path: str) -> ParseResult:
  174. """
  175. 解析文件的入口方法
  176. Args:
  177. file_path: 文件路径
  178. Returns:
  179. ParseResult: 解析结果
  180. """
  181. import time
  182. import os
  183. start_time = time.time()
  184. file_size = 0
  185. try:
  186. file_size = os.path.getsize(file_path)
  187. except Exception:
  188. pass
  189. log.info(f"开始解析文件: {file_path}, 文件大小: {file_size / (1024 * 1024):.2f} MB")
  190. try:
  191. parser = await self.get_parser(file_path)
  192. log.info(f"获取到解析器: {parser.__class__.__name__}")
  193. result = await parser.parse(file_path)
  194. end_time = time.time()
  195. elapsed_time = end_time - start_time
  196. # 更新统计信息
  197. self.stats['total_files'] += 1
  198. self.stats['total_size'] += file_size
  199. self.stats['total_time'] += elapsed_time
  200. self.stats['successful_files'] += 1
  201. # 根据文件类型更新统计
  202. file_type = result.file_type
  203. if file_type.startswith('text'):
  204. self.stats['text_files'] += 1
  205. self.stats['text_size'] += file_size
  206. elif file_type.startswith('image') or file_type == 'visual':
  207. self.stats['image_files'] += 1
  208. self.stats['image_size'] += file_size
  209. elif file_type.startswith('audio'):
  210. self.stats['audio_files'] += 1
  211. self.stats['audio_size'] += file_size
  212. elif file_type.startswith('video'):
  213. self.stats['video_files'] += 1
  214. self.stats['video_size'] += file_size
  215. elif file_type == 'pdf' or file_type == 'pdf_scanned':
  216. self.stats['pdf_files'] += 1
  217. self.stats['pdf_size'] += file_size
  218. elif file_type == 'office':
  219. self.stats['office_files'] += 1
  220. self.stats['office_size'] += file_size
  221. # 解析结果日志
  222. log.info(f"文件解析完成,耗时: {elapsed_time:.2f} 秒")
  223. log.info(f"文件类型: {result.file_type}")
  224. log.info(f"解析内容长度: {len(result.content)} 字符")
  225. log.info(f"元数据: {result.metadata}")
  226. if result.tables:
  227. log.info(f"提取到表格数量: {len(result.tables)}")
  228. return result
  229. except Exception as e:
  230. end_time = time.time()
  231. elapsed_time = end_time - start_time
  232. # 更新统计信息
  233. self.stats['total_files'] += 1
  234. self.stats['total_size'] += file_size
  235. self.stats['total_time'] += elapsed_time
  236. self.stats['failed_files'] += 1
  237. log.error(f"解析失败: {str(e)}, 耗时: {elapsed_time:.2f} 秒")
  238. # 返回错误结果
  239. return ParseResult(
  240. content=f"解析失败: {str(e)}",
  241. metadata={"error": str(e)},
  242. file_type="error"
  243. )
  244. def generate_performance_report(self) -> str:
  245. """
  246. 生成性能报告
  247. Returns:
  248. str: 性能报告
  249. """
  250. stats = self.stats
  251. # 计算各项指标
  252. total_files = stats['total_files']
  253. total_size = stats['total_size']
  254. total_time = stats['total_time']
  255. successful_files = stats['successful_files']
  256. failed_files = stats['failed_files']
  257. # 计算各类文件占比
  258. text_ratio = (stats['text_size'] / total_size * 100) if total_size > 0 else 0
  259. image_ratio = (stats['image_size'] / total_size * 100) if total_size > 0 else 0
  260. audio_ratio = (stats['audio_size'] / total_size * 100) if total_size > 0 else 0
  261. video_ratio = (stats['video_size'] / total_size * 100) if total_size > 0 else 0
  262. pdf_ratio = (stats['pdf_size'] / total_size * 100) if total_size > 0 else 0
  263. office_ratio = (stats['office_size'] / total_size * 100) if total_size > 0 else 0
  264. # 计算解析速度
  265. total_size_mb = total_size / (1024 * 1024)
  266. avg_speed = (total_size_mb / total_time) if total_time > 0 else 0
  267. # 生成报告
  268. report = f"""# 解析性能报告
  269. ## 总体情况
  270. - 总解析文件数: {total_files}
  271. - 成功解析: {successful_files}
  272. - 解析失败: {failed_files}
  273. - 总文件大小: {total_size_mb:.2f} MB
  274. - 总耗时: {total_time:.2f} 秒
  275. - 平均解析速度: {avg_speed:.2f} MB/秒
  276. ## 文件类型分布
  277. - 文本文件: {stats['text_files']} 个, {stats['text_size'] / (1024 * 1024):.2f} MB, 占比: {text_ratio:.2f}%
  278. - 图片文件: {stats['image_files']} 个, {stats['image_size'] / (1024 * 1024):.2f} MB, 占比: {image_ratio:.2f}%
  279. - 音频文件: {stats['audio_files']} 个, {stats['audio_size'] / (1024 * 1024):.2f} MB, 占比: {audio_ratio:.2f}%
  280. - 视频文件: {stats['video_files']} 个, {stats['video_size'] / (1024 * 1024):.2f} MB, 占比: {video_ratio:.2f}%
  281. - PDF文件: {stats['pdf_files']} 个, {stats['pdf_size'] / (1024 * 1024):.2f} MB, 占比: {pdf_ratio:.2f}%
  282. - Office文件: {stats['office_files']} 个, {stats['office_size'] / (1024 * 1024):.2f} MB, 占比: {office_ratio:.2f}%
  283. ## 性能分析
  284. - 文本类平均解析速度: {(stats['text_size'] / (1024 * 1024) / total_time):.2f} MB/秒 (如果有文本文件)
  285. - 图片类平均解析速度: {(stats['image_size'] / (1024 * 1024) / total_time):.2f} MB/秒 (如果有图片文件)
  286. - 音频类平均解析速度: {(stats['audio_size'] / (1024 * 1024) / total_time):.2f} MB/秒 (如果有音频文件)
  287. - 视频类平均解析速度: {(stats['video_size'] / (1024 * 1024) / total_time):.2f} MB/秒 (如果有视频文件)
  288. """
  289. return report