# video_parser.py (6.2 KB)

import asyncio
import base64
import os
import tempfile

import requests

from core.router import Parser
from models.result import ParseResult
from parsers.audio_parser import AudioParser
from utils.ffmpeg_wrapper import FFmpegWrapper
from utils.logger import log
  10. class VideoParser(Parser):
  11. """视频文件解析器"""
  12. def __init__(self):
  13. self.ffmpeg = FFmpegWrapper()
  14. self.audio_parser = AudioParser()
  15. # Qwen3-VL模型配置
  16. self.qwen_api_url = "http://10.192.72.13:7280/v1/chat/completions"
  17. async def parse(self, file_path: str) -> ParseResult:
  18. """
  19. 解析视频文件
  20. Args:
  21. file_path: 文件路径
  22. Returns:
  23. ParseResult: 解析结果
  24. """
  25. log.info(f"开始解析视频文件: {file_path}")
  26. try:
  27. # 1. 提取音频轨道
  28. with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
  29. temp_audio_path = temp_file.name
  30. self.ffmpeg.extract_audio(file_path, temp_audio_path)
  31. log.info(f"音频提取完成: {temp_audio_path}")
  32. # 2. 使用AudioParser解析音频
  33. audio_result = await self.audio_parser.parse(temp_audio_path)
  34. log.info("音频解析完成")
  35. # 3. 提取关键帧
  36. frame_results = [] # 移到外部定义
  37. with tempfile.TemporaryDirectory() as temp_dir:
  38. # 使用固定频率先提取帧,再通过帧差法筛选关键帧
  39. interval_seconds = 10
  40. diff_threshold = 15.0
  41. keyframes = self.ffmpeg.extract_keyframes(file_path, temp_dir, interval=interval_seconds, diff_threshold=diff_threshold)
  42. log.info(f"关键帧提取完成(帧差阈值={diff_threshold}),共{len(keyframes)}张")
  43. # 4. 使用Qwen3-VL解析关键帧;根据帧文件名计算时间点
  44. for idx, frame_path in enumerate(keyframes):
  45. try:
  46. frame_content = self._parse_frame_with_qwen(frame_path)
  47. log.info(f"解析关键帧 {idx+1} 结果长度: {len(frame_content) if frame_content else 0}")
  48. if frame_content:
  49. # 从文件名解析帧序号,filename like frame_000001.jpg
  50. try:
  51. base = os.path.basename(frame_path)
  52. num_part = base.split('_')[1].split('.')[0]
  53. frame_index = int(num_part)
  54. time_second = (frame_index - 1) * interval_seconds
  55. except Exception:
  56. time_second = idx * interval_seconds
  57. frame_results.append((time_second, frame_content))
  58. log.info(f"添加关键帧 到结果列表,时间:{time_second}s")
  59. else:
  60. log.warning(f"关键帧 {idx+1} 解析结果为空")
  61. except Exception as e:
  62. log.warning(f"解析关键帧 {idx+1} 失败: {str(e)}")
  63. log.info(f"关键帧解析完成,frame_results长度: {len(frame_results)}")
  64. # 5. 合并结果
  65. content = []
  66. content.append("# 音频内容")
  67. content.append(audio_result.content)
  68. if frame_results:
  69. log.info("开始添加画面内容到结果")
  70. content.append("\n# 画面内容")
  71. for time_second, frame_content in frame_results:
  72. content.append(f"\n## 第{time_second}秒")
  73. content.append(frame_content)
  74. log.info(f"添加第{time_second}秒画面内容,长度: {len(frame_content)}")
  75. else:
  76. log.warning("没有画面内容可以添加")
  77. # 清理临时文件
  78. if os.path.exists(temp_audio_path):
  79. os.remove(temp_audio_path)
  80. return ParseResult(
  81. content="\n".join(content),
  82. metadata={
  83. "parser": "VideoParser",
  84. "file_size": os.path.getsize(file_path),
  85. "audio_parser": "Qwen3-ASR",
  86. "visual_parser": "Qwen3-VL",
  87. "keyframe_count": len(keyframes)
  88. },
  89. file_type="video"
  90. )
  91. except Exception as e:
  92. log.error(f"视频文件解析失败: {str(e)}")
  93. # 清理临时文件
  94. if 'temp_audio_path' in locals() and os.path.exists(temp_audio_path):
  95. os.remove(temp_audio_path)
  96. return ParseResult(
  97. content="",
  98. metadata={"error": str(e)},
  99. file_type="video"
  100. )
  101. def _parse_frame_with_qwen(self, image_path: str) -> str:
  102. """
  103. 使用Qwen3-VL模型解析图片
  104. Args:
  105. image_path: 图片路径
  106. Returns:
  107. str: 解析结果
  108. """
  109. log.info(f"使用Qwen3-VL解析图片: {image_path}")
  110. # 编码图片
  111. with open(image_path, "rb") as f:
  112. base64_image = base64.b64encode(f.read()).decode("utf-8")
  113. # 发送请求
  114. payload = {
  115. "model": "/model",
  116. "messages": [{
  117. "role": "user",
  118. "content": [
  119. {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}},
  120. {"type": "text", "text": "详细描述这张图片的内容,包括人物、物体、场景、文字等所有可见信息"}
  121. ]
  122. }],
  123. "max_tokens": 512
  124. }
  125. response = requests.post(self.qwen_api_url, json=payload, timeout=120)
  126. response.raise_for_status()
  127. result = response.json()
  128. return result["choices"][0]["message"]["content"]