native_parser.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282
  1. from core.router import Parser
  2. from models.result import ParseResult
  3. from utils.logger import log
  4. import fitz # PyMuPDF
  5. from docx import Document
  6. import openpyxl
  7. from pptx import Presentation
  8. import os
  9. class NativeDocParser(Parser):
  10. """原生文档解析器,处理Office文档和原生PDF"""
  11. async def parse(self, file_path: str) -> ParseResult:
  12. """
  13. 解析原生文档
  14. Args:
  15. file_path: 文件路径
  16. Returns:
  17. ParseResult: 解析结果
  18. """
  19. log.info(f"开始解析原生文档: {file_path}")
  20. try:
  21. # 根据文件扩展名判断文件类型
  22. ext = os.path.splitext(file_path)[1].lower()
  23. if ext == '.pdf':
  24. return await self._parse_pdf(file_path)
  25. elif ext == '.docx':
  26. return await self._parse_docx(file_path)
  27. elif ext == '.doc':
  28. return await self._parse_doc(file_path)
  29. elif ext == '.xlsx':
  30. return await self._parse_xlsx(file_path)
  31. elif ext == '.pptx':
  32. return await self._parse_pptx(file_path)
  33. else:
  34. raise Exception(f"不支持的文件扩展名: {ext}")
  35. except Exception as e:
  36. log.error(f"原生文档解析失败: {str(e)}")
  37. return ParseResult(
  38. content="",
  39. metadata={"error": str(e)},
  40. file_type="unknown"
  41. )
  42. async def _parse_pdf(self, file_path: str) -> ParseResult:
  43. """
  44. 解析PDF文件
  45. Args:
  46. file_path: 文件路径
  47. Returns:
  48. ParseResult: 解析结果
  49. """
  50. doc = fitz.open(file_path)
  51. content = []
  52. tables = []
  53. page_count = len(doc)
  54. # 遍历所有页面
  55. for page_num in range(page_count):
  56. page = doc[page_num]
  57. # 提取文本
  58. text = page.get_text()
  59. content.append(f"# 第{page_num + 1}页\n{text}")
  60. # 提取表格(PyMuPDF的表格提取功能有限)
  61. # 这里可以根据需要使用更高级的表格提取库
  62. doc.close()
  63. return ParseResult(
  64. content="\n\n".join(content),
  65. metadata={
  66. "page_count": page_count,
  67. "file_size": os.path.getsize(file_path)
  68. },
  69. file_type="pdf",
  70. tables=tables
  71. )
  72. async def _parse_docx(self, file_path: str) -> ParseResult:
  73. """
  74. 解析Word文档
  75. Args:
  76. file_path: 文件路径
  77. Returns:
  78. ParseResult: 解析结果
  79. """
  80. doc = Document(file_path)
  81. content = []
  82. tables = []
  83. # 提取标题和正文
  84. for para in doc.paragraphs:
  85. if para.style.name.startswith('Heading'):
  86. # 根据标题级别添加Markdown标题
  87. level = int(para.style.name.split(' ')[1])
  88. content.append(f"{'#' * level} {para.text}")
  89. else:
  90. content.append(para.text)
  91. # 提取表格
  92. for table_idx, table in enumerate(doc.tables):
  93. table_content = []
  94. table_data = []
  95. # 提取表头
  96. header_cells = table.rows[0].cells
  97. header = [cell.text.strip() for cell in header_cells]
  98. table_content.append('| ' + ' | '.join(header) + ' |')
  99. table_content.append('| ' + ' | '.join(['---'] * len(header)) + ' |')
  100. table_data.append(header)
  101. # 提取表格内容
  102. for row in table.rows[1:]:
  103. cells = row.cells
  104. row_data = [cell.text.strip() for cell in cells]
  105. table_content.append('| ' + ' | '.join(row_data) + ' |')
  106. table_data.append(row_data)
  107. content.append('\n'.join(table_content))
  108. tables.append({
  109. "table_id": table_idx,
  110. "data": table_data
  111. })
  112. return ParseResult(
  113. content="\n".join(content),
  114. metadata={
  115. "paragraph_count": len(doc.paragraphs),
  116. "table_count": len(doc.tables),
  117. "file_size": os.path.getsize(file_path)
  118. },
  119. file_type="docx",
  120. tables=tables
  121. )
  122. async def _parse_xlsx(self, file_path: str) -> ParseResult:
  123. """
  124. 解析Excel文档
  125. Args:
  126. file_path: 文件路径
  127. Returns:
  128. ParseResult: 解析结果
  129. """
  130. wb = openpyxl.load_workbook(file_path)
  131. content = []
  132. tables = []
  133. # 遍历所有工作表
  134. for sheet_idx, sheet_name in enumerate(wb.sheetnames):
  135. sheet = wb[sheet_name]
  136. content.append(f"# 工作表: {sheet_name}")
  137. # 提取表格数据
  138. table_data = []
  139. max_row = sheet.max_row
  140. max_col = sheet.max_column
  141. # 提取表头
  142. header = []
  143. for col in range(1, max_col + 1):
  144. cell_value = sheet.cell(row=1, column=col).value
  145. header.append(str(cell_value) if cell_value else '')
  146. table_data.append(header)
  147. # 提取表格内容
  148. for row in range(2, max_row + 1):
  149. row_data = []
  150. for col in range(1, max_col + 1):
  151. cell_value = sheet.cell(row=row, column=col).value
  152. row_data.append(str(cell_value) if cell_value else '')
  153. table_data.append(row_data)
  154. # 转换为Markdown表格
  155. if header:
  156. markdown_table = []
  157. markdown_table.append('| ' + ' | '.join(header) + ' |')
  158. markdown_table.append('| ' + ' | '.join(['---'] * len(header)) + ' |')
  159. for row_data in table_data[1:]:
  160. markdown_table.append('| ' + ' | '.join(row_data) + ' |')
  161. content.append('\n'.join(markdown_table))
  162. tables.append({
  163. "sheet_name": sheet_name,
  164. "data": table_data
  165. })
  166. wb.close()
  167. return ParseResult(
  168. content="\n\n".join(content),
  169. metadata={
  170. "sheet_count": len(wb.sheetnames),
  171. "file_size": os.path.getsize(file_path)
  172. },
  173. file_type="xlsx",
  174. tables=tables
  175. )
  176. async def _parse_pptx(self, file_path: str) -> ParseResult:
  177. """
  178. 解析PPT文档
  179. Args:
  180. file_path: 文件路径
  181. Returns:
  182. ParseResult: 解析结果
  183. """
  184. prs = Presentation(file_path)
  185. content = []
  186. # 遍历所有幻灯片
  187. for slide_idx, slide in enumerate(prs.slides):
  188. content.append(f"# 幻灯片 {slide_idx + 1}")
  189. # 提取标题
  190. for shape in slide.shapes:
  191. if hasattr(shape, 'text_frame') and shape.text_frame.text:
  192. if shape == slide.shapes[0]: # 假设第一个形状是标题
  193. content.append(f"## {shape.text_frame.text}")
  194. else:
  195. content.append(shape.text_frame.text)
  196. # 提取备注
  197. if slide.notes_slide:
  198. notes = slide.notes_slide.notes_text_frame.text
  199. if notes:
  200. content.append(f"### 备注\n{notes}")
  201. return ParseResult(
  202. content="\n\n".join(content),
  203. metadata={
  204. "slide_count": len(prs.slides),
  205. "file_size": os.path.getsize(file_path)
  206. },
  207. file_type="pptx"
  208. )
  209. async def _parse_doc(self, file_path: str) -> ParseResult:
  210. """
  211. 解析.doc文件
  212. Args:
  213. file_path: 文件路径
  214. Returns:
  215. ParseResult: 解析结果
  216. """
  217. # 使用antiword提取.doc文件内容
  218. import subprocess
  219. try:
  220. result = subprocess.run(
  221. ['antiword', file_path],
  222. capture_output=True,
  223. text=True,
  224. check=True
  225. )
  226. text = result.stdout
  227. except Exception as e:
  228. log.error(f"antiword解析失败: {str(e)}")
  229. raise Exception(f"antiword解析失败: {str(e)}")
  230. content = [text]
  231. return ParseResult(
  232. content="\n".join(content),
  233. metadata={
  234. "file_size": os.path.getsize(file_path)
  235. },
  236. file_type="doc"
  237. )