api.py

"""API client for MinerU file-to-Markdown conversion."""
import asyncio
import os
import zipfile
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import aiohttp
import requests

from . import config


def singleton_func(cls):
    instance = {}

    def _singleton(*args, **kwargs):
        if cls not in instance:
            instance[cls] = cls(*args, **kwargs)
        return instance[cls]

    return _singleton


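# Because of @singleton_func, repeated constructor calls such as MinerUClient() or
# MinerUClient(api_key="...") all return the same shared instance (the first one created).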
@singleton_func
class MinerUClient:
    """
    Client for interacting with the MinerU API to convert files to Markdown.
    """

    def __init__(self, api_base: Optional[str] = None, api_key: Optional[str] = None):
        """
        Initialize the MinerU API client.

        Args:
            api_base: Base URL of the MinerU API (default: taken from the environment)
            api_key: API key used to authenticate with MinerU (default: taken from the environment)
        """
        self.api_base = api_base or config.MINERU_API_BASE
        self.api_key = api_key or config.MINERU_API_KEY
        if not self.api_key:
            # Provide a friendlier error message
            raise ValueError(
                "Error: the MinerU API key (MINERU_API_KEY) is not set or is empty.\n"
                "Make sure the MINERU_API_KEY environment variable is set, for example:\n"
                "  export MINERU_API_KEY='your_actual_api_key'\n"
                "Alternatively, define the variable in a `.env` file in the project root."
            )

    async def _request(self, method: str, endpoint: str, **kwargs) -> Dict[str, Any]:
        """
        Send a request to the MinerU API.

        Args:
            method: HTTP method (GET, POST, etc.)
            endpoint: API endpoint path (without the base URL)
            **kwargs: Additional arguments passed to the aiohttp request

        Returns:
            dict: API response (JSON)
        """
        url = f"{self.api_base}{endpoint}"
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Accept": "application/json",
        }
        if "headers" in kwargs:
            kwargs["headers"].update(headers)
        else:
            kwargs["headers"] = headers
        # Build a copy of the arguments without credentials, for logging only
        log_kwargs = kwargs.copy()
        if "headers" in log_kwargs and "Authorization" in log_kwargs["headers"]:
            log_kwargs["headers"] = log_kwargs["headers"].copy()
            log_kwargs["headers"]["Authorization"] = "Bearer ****"  # hide the API key
        config.logger.debug(f"API request: {method} {url}")
        config.logger.debug(f"Request arguments: {log_kwargs}")
        async with aiohttp.ClientSession() as session:
            async with session.request(method, url, **kwargs) as response:
                response.raise_for_status()
                response_json = await response.json()
                config.logger.debug(f"API response: {response_json}")
                return response_json
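
    # Note (illustrative sketch): every endpoint below goes through _request, e.g.
    #   await self._request("POST", "/api/v4/file-urls/batch", json=payload)
    # which attaches the Bearer token and returns the decoded JSON body as a dict.
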
    async def submit_file_url_task(
        self,
        urls: Union[str, List[Union[str, Dict[str, Any]]], Dict[str, Any]],
        enable_ocr: bool = True,
        language: str = "ch",
        page_ranges: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Submit file URLs for conversion to Markdown. Accepts a single URL or a batch of URLs.

        Args:
            urls: One of the following:
                1. A single URL string
                2. A list of URLs
                3. A list of URL configuration dicts, each containing:
                    - url: file URL (required)
                    - is_ocr: whether to enable OCR (optional)
                    - data_id: data ID of the file (optional)
                    - page_ranges: page range (optional)
            enable_ocr: Whether to enable OCR for the conversion (default for all files)
            language: Document language; defaults to "ch" (Chinese)
            page_ranges: Page range as a comma-separated string. For example, "2,4-6" selects
                page 2 and pages 4 through 6; "2--2" selects page 2 through the second-to-last page.

        Returns:
            dict: Task information, including the batch_id
        """
        # Count the URLs
        url_count = 1
        if isinstance(urls, list):
            url_count = len(urls)
        config.logger.debug(
            f"submit_file_url_task called: {url_count} URL(s), "
            + f"ocr={enable_ocr}, "
            + f"language={language}"
        )
        # Process the input so that we end up with a list of URL configurations
        urls_config = []
        # Convert the input into the standard format
        if isinstance(urls, str):
            urls_config.append(
                {"url": urls, "is_ocr": enable_ocr, "page_ranges": page_ranges}
            )
        elif isinstance(urls, list):
            # A list of URL strings or URL configuration dicts
            for url_item in urls:
                if isinstance(url_item, str):
                    # Plain URL string
                    urls_config.append(
                        {
                            "url": url_item,
                            "is_ocr": enable_ocr,
                            "page_ranges": page_ranges,
                        }
                    )
                elif isinstance(url_item, dict):
                    # URL dict with detailed configuration
                    if "url" not in url_item:
                        raise ValueError(f"URL configuration must contain a 'url' field: {url_item}")
                    url_is_ocr = url_item.get("is_ocr", enable_ocr)
                    url_page_ranges = url_item.get("page_ranges", page_ranges)
                    url_config = {"url": url_item["url"], "is_ocr": url_is_ocr}
                    if url_page_ranges is not None:
                        url_config["page_ranges"] = url_page_ranges
                    urls_config.append(url_config)
                else:
                    raise TypeError(f"Unsupported URL configuration type: {type(url_item)}")
        elif isinstance(urls, dict):
            # A single URL configuration dict
            if "url" not in urls:
                raise ValueError(f"URL configuration must contain a 'url' field: {urls}")
            url_is_ocr = urls.get("is_ocr", enable_ocr)
            url_page_ranges = urls.get("page_ranges", page_ranges)
            url_config = {"url": urls["url"], "is_ocr": url_is_ocr}
            if url_page_ranges is not None:
                url_config["page_ranges"] = url_page_ranges
            urls_config.append(url_config)
        else:
            raise TypeError(f"urls must be a string, a list, or a dict, not {type(urls)}")
        # Build the API request payload
        files_payload = urls_config  # unlike submit_file_task, the URL configs are used directly
        payload = {
            "language": language,
            "files": files_payload,
        }
        # Call the batch API
        response = await self._request(
            "POST", "/api/v4/extract/task/batch", json=payload
        )
        # Check the response
        if "data" not in response or "batch_id" not in response["data"]:
            raise ValueError(f"Failed to submit the batch URL task: {response}")
        batch_id = response["data"]["batch_id"]
        config.logger.info(f"Started processing {len(urls_config)} file URL(s)")
        config.logger.debug(f"Batch URL task submitted successfully, batch ID: {batch_id}")
        # Return the batch_id together with the URL information
        result = {
            "data": {
                "batch_id": batch_id,
                "uploaded_files": [url_config.get("url") for url_config in urls_config],
            }
        }
        # For a single URL, set file_name to stay compatible with the original return format
        if len(urls_config) == 1:
            url = urls_config[0]["url"]
            # Derive the file name from the URL
            file_name = url.split("/")[-1]
            result["data"]["file_name"] = file_name
        return result
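
    # Illustrative sketch of the input shapes submit_file_url_task accepts; the
    # example.com URLs are placeholders:
    #   await client.submit_file_url_task("https://example.com/a.pdf")
    #   await client.submit_file_url_task(["https://example.com/a.pdf", "https://example.com/b.pdf"])
    #   await client.submit_file_url_task(
    #       [{"url": "https://example.com/a.pdf", "is_ocr": False, "page_ranges": "2,4-6"}]
    #   )
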
    async def submit_file_task(
        self,
        files: Union[str, List[Union[str, Dict[str, Any]]], Dict[str, Any]],
        enable_ocr: bool = True,
        language: str = "ch",
        page_ranges: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Submit local files for conversion to Markdown. Accepts a single file path or
        multiple file configurations.

        Args:
            files: One of the following:
                1. A single file path string
                2. A list of file paths
                3. A list of file configuration dicts, each containing:
                    - path/name: file path or file name
                    - is_ocr: whether to enable OCR (optional)
                    - data_id: data ID of the file (optional)
                    - page_ranges: page range (optional)
            enable_ocr: Whether to enable OCR for the conversion (default for all files)
            language: Document language; defaults to "ch" (Chinese)
            page_ranges: Page range as a comma-separated string. For example, "2,4-6" selects
                page 2 and pages 4 through 6; "2--2" selects page 2 through the second-to-last page.

        Returns:
            dict: Task information, including the batch_id
        """
        # Count the files
        file_count = 1
        if isinstance(files, list):
            file_count = len(files)
        config.logger.debug(
            f"submit_file_task called: {file_count} file(s), "
            + f"ocr={enable_ocr}, "
            + f"language={language}"
        )
        # Process the input so that we end up with a list of file configurations
        files_config = []
        # Convert the input into the standard format
        if isinstance(files, str):
            # A single file path
            file_path = Path(files)
            if not file_path.exists():
                raise FileNotFoundError(f"File not found: {file_path}")
            files_config.append(
                {
                    "path": file_path,
                    "name": file_path.name,
                    "is_ocr": enable_ocr,
                    "page_ranges": page_ranges,
                }
            )
        elif isinstance(files, list):
            # A list of file paths or file configuration dicts
            for file_item in files:
                if isinstance(file_item, str):
                    # Plain file path
                    file_path = Path(file_item)
                    if not file_path.exists():
                        raise FileNotFoundError(f"File not found: {file_path}")
                    files_config.append(
                        {
                            "path": file_path,
                            "name": file_path.name,
                            "is_ocr": enable_ocr,
                            "page_ranges": page_ranges,
                        }
                    )
                elif isinstance(file_item, dict):
                    # File dict with detailed configuration
                    if "path" not in file_item and "name" not in file_item:
                        raise ValueError(
                            f"File configuration must contain a 'path' or 'name' field: {file_item}"
                        )
                    if "path" in file_item:
                        file_path = Path(file_item["path"])
                        if not file_path.exists():
                            raise FileNotFoundError(f"File not found: {file_path}")
                        file_name = file_path.name
                    else:
                        file_name = file_item["name"]
                        file_path = None
                    file_is_ocr = file_item.get("is_ocr", enable_ocr)
                    file_page_ranges = file_item.get("page_ranges", page_ranges)
                    file_config = {
                        "path": file_path,
                        "name": file_name,
                        "is_ocr": file_is_ocr,
                    }
                    if file_page_ranges is not None:
                        file_config["page_ranges"] = file_page_ranges
                    files_config.append(file_config)
                else:
                    raise TypeError(f"Unsupported file configuration type: {type(file_item)}")
        elif isinstance(files, dict):
            # A single file configuration dict
            if "path" not in files and "name" not in files:
                raise ValueError(f"File configuration must contain a 'path' or 'name' field: {files}")
            if "path" in files:
                file_path = Path(files["path"])
                if not file_path.exists():
                    raise FileNotFoundError(f"File not found: {file_path}")
                file_name = file_path.name
            else:
                file_name = files["name"]
                file_path = None
            file_is_ocr = files.get("is_ocr", enable_ocr)
            file_page_ranges = files.get("page_ranges", page_ranges)
            file_config = {
                "path": file_path,
                "name": file_name,
                "is_ocr": file_is_ocr,
            }
            if file_page_ranges is not None:
                file_config["page_ranges"] = file_page_ranges
            files_config.append(file_config)
        else:
            raise TypeError(f"files must be a string, a list, or a dict, not {type(files)}")
        # Step 1: build the API request payload
        files_payload = []
        for file_config in files_config:
            file_payload = {
                "name": file_config["name"],
                "is_ocr": file_config["is_ocr"],
            }
            if "page_ranges" in file_config and file_config["page_ranges"] is not None:
                file_payload["page_ranges"] = file_config["page_ranges"]
            files_payload.append(file_payload)
        payload = {
            "language": language,
            "files": files_payload,
        }
        # Step 2: obtain the file upload URLs
        response = await self._request("POST", "/api/v4/file-urls/batch", json=payload)
        # Check the response
        if (
            "data" not in response
            or "batch_id" not in response["data"]
            or "file_urls" not in response["data"]
        ):
            raise ValueError(f"Failed to obtain upload URLs: {response}")
        batch_id = response["data"]["batch_id"]
        file_urls = response["data"]["file_urls"]
        if len(file_urls) != len(files_config):
            raise ValueError(
                f"Number of upload URLs ({len(file_urls)}) does not match the number of files ({len(files_config)})"
            )
        config.logger.info(f"Uploading {len(file_urls)} local file(s)")
        config.logger.debug(f"Upload URLs obtained successfully, batch ID: {batch_id}")
        # Step 3: upload all files
        uploaded_files = []
        for file_config, upload_url in zip(files_config, file_urls):
            file_path = file_config["path"]
            if file_path is None:
                raise ValueError(f"File {file_config['name']} has no valid path")
            try:
                with open(file_path, "rb") as f:
                    # Important: do not set Content-Type; let OSS handle it automatically
                    response = requests.put(upload_url, data=f)
                    if response.status_code != 200:
                        raise ValueError(
                            f"File upload failed, status code: {response.status_code}, response: {response.text}"
                        )
                    config.logger.debug(f"File {file_path.name} uploaded successfully")
                    uploaded_files.append(file_path.name)
            except Exception as e:
                raise ValueError(f"Failed to upload file {file_path.name}: {str(e)}")
        config.logger.info(f"File upload finished, {len(uploaded_files)} file(s) in total")
        # Return the batch_id together with the uploaded file names
        result = {"data": {"batch_id": batch_id, "uploaded_files": uploaded_files}}
        # For a single file, keep the original return format for compatibility
        if len(uploaded_files) == 1:
            result["data"]["file_name"] = uploaded_files[0]
        return result
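
    # Illustrative sketch of the input shapes submit_file_task accepts; the docs/*.pdf
    # paths are placeholders and must exist locally:
    #   await client.submit_file_task("docs/a.pdf")
    #   await client.submit_file_task(["docs/a.pdf", "docs/b.pdf"])
    #   await client.submit_file_task(
    #       [{"path": "docs/a.pdf", "is_ocr": False, "page_ranges": "1-3"}]
    #   )
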
    async def get_batch_task_status(self, batch_id: str) -> Dict[str, Any]:
        """
        Get the status of a batch conversion task.

        Args:
            batch_id: ID of the batch task

        Returns:
            dict: Status information for the batch task
        """
        response = await self._request(
            "GET", f"/api/v4/extract-results/batch/{batch_id}"
        )
        return response

    async def process_file_to_markdown(
        self,
        task_fn,
        task_arg: Union[str, List[Dict[str, Any]], Dict[str, Any]],
        enable_ocr: bool = True,
        output_dir: Optional[str] = None,
        max_retries: int = 180,
        retry_interval: int = 10,
    ) -> Union[str, Dict[str, Any]]:
        """
        Run a file-to-Markdown conversion from start to finish.

        Args:
            task_fn: Task submission function (submit_file_url_task or submit_file_task)
            task_arg: Argument for the task function, one of:
                - a URL string
                - a file path string
                - a dict with a single file configuration
                - a list of dicts with multiple file configurations
            enable_ocr: Whether to enable OCR
            output_dir: Output directory for the results
            max_retries: Maximum number of status-check retries
            retry_interval: Interval between status checks (seconds)

        Returns:
            Union[str, Dict[str, Any]]:
                - Single file: path of the directory containing the extracted Markdown files
                - Multiple files: {
                    "results": [
                        {
                            "filename": str,
                            "status": str,
                            "content": str,
                            "error_message": str,
                        }
                    ],
                    "extract_dir": str
                  }
        """
        try:
            # Submit the task - call with positional arguments, not keyword arguments
            task_info = await task_fn(task_arg, enable_ocr)
            # Batch task handling
            batch_id = task_info["data"]["batch_id"]
            # Collect the names of all uploaded files
            uploaded_files = task_info["data"].get("uploaded_files", [])
            if not uploaded_files and "file_name" in task_info["data"]:
                uploaded_files = [task_info["data"]["file_name"]]
            if not uploaded_files:
                raise ValueError("Unable to determine which files were uploaded")
            config.logger.debug(f"Batch task submitted successfully. Batch ID: {batch_id}")
            # Track the processing state of every file
            files_status = {}  # keyed by file_name
            files_download_urls = {}
            failed_files = {}  # failed files and their error messages
            # Prepare the output path
            output_path = config.ensure_output_dir(output_dir)
            # Poll until the task completes
            for i in range(max_retries):
                status_info = await self.get_batch_task_status(batch_id)
                config.logger.debug(f"Polling result: {status_info}")
                if (
                    "data" not in status_info
                    or "extract_result" not in status_info["data"]
                ):
                    config.logger.error(f"Failed to get the batch task status: {status_info}")
                    await asyncio.sleep(retry_interval)
                    continue
                # Check the state of every file
                all_done = True
                has_progress = False
                for result in status_info["data"]["extract_result"]:
                    file_name = result.get("file_name")
                    if not file_name:
                        continue
                    # Initialize the state if it has not been recorded yet
                    if file_name not in files_status:
                        files_status[file_name] = "pending"
                    state = result.get("state")
                    files_status[file_name] = state
                    if state == "done":
                        # Store the download link
                        full_zip_url = result.get("full_zip_url")
                        if full_zip_url:
                            files_download_urls[file_name] = full_zip_url
                            config.logger.info(f"File {file_name} processed successfully")
                        else:
                            config.logger.debug(
                                f"File {file_name} is marked as done but has no download link"
                            )
                            all_done = False
                    elif state in ["failed", "error"]:
                        err_msg = result.get("err_msg", "unknown error")
                        failed_files[file_name] = err_msg
                        config.logger.warning(f"File {file_name} failed: {err_msg}")
                        # Do not raise; keep processing the other files
                    else:
                        all_done = False
                        # Show progress information
                        if state == "running" and "extract_progress" in result:
                            has_progress = True
                            progress = result["extract_progress"]
                            extracted = progress.get("extracted_pages", 0)
                            total = progress.get("total_pages", 0)
                            if total > 0:
                                percent = (extracted / total) * 100
                                config.logger.info(
                                    f"Progress: {file_name} "
                                    + f"{extracted}/{total} pages "
                                    + f"({percent:.1f}%)"
                                )
                # Check whether every file has been processed
                expected_file_count = len(uploaded_files)
                processed_file_count = len(files_status)
                completed_file_count = len(files_download_urls) + len(failed_files)
                # Log the current state
                config.logger.debug(
                    f"File processing state: all_done={all_done}, "
                    + f"files_status count={processed_file_count}, "
                    + f"uploaded file count={expected_file_count}, "
                    + f"download link count={len(files_download_urls)}, "
                    + f"failed file count={len(failed_files)}"
                )
                # Determine whether every file has finished (successfully or not)
                if (
                    processed_file_count > 0
                    and processed_file_count >= expected_file_count
                    and completed_file_count >= processed_file_count
                ):
                    if files_download_urls or failed_files:
                        config.logger.info("File processing finished")
                        if failed_files:
                            config.logger.warning(
                                f"{len(failed_files)} file(s) failed to process"
                            )
                        break
                    else:
                        # This should not happen, but guard against it just in case
                        all_done = False
                # If there is no progress information, show a simple waiting message
                if not has_progress:
                    config.logger.info(f"Waiting for files to finish processing... ({i+1}/{max_retries})")
                await asyncio.sleep(retry_interval)
            else:
                # Maximum retries exceeded; check whether some files did finish
                if not files_download_urls and not failed_files:
                    raise TimeoutError(f"Batch task {batch_id} did not finish within the allowed time")
                else:
                    config.logger.warning(
                        "Warning: some files did not finish within the allowed time; "
                        + "continuing with the files that completed"
                    )
            # Create the main extraction directory
            extract_dir = output_path / batch_id
            extract_dir.mkdir(exist_ok=True)
            # Prepare the result list
            results = []
            # Download and unpack the result of every successfully processed file
            for file_name, download_url in files_download_urls.items():
                try:
                    config.logger.debug(f"Downloading processing result: {file_name}")
                    # Use the zip file name from the download URL as the subdirectory name
                    zip_file_name = download_url.split("/")[-1]
                    # Strip the .zip extension
                    zip_dir_name = os.path.splitext(zip_file_name)[0]
                    file_extract_dir = extract_dir / zip_dir_name
                    file_extract_dir.mkdir(exist_ok=True)
                    # Download the ZIP file
                    zip_path = output_path / f"{batch_id}_{zip_file_name}"
                    async with aiohttp.ClientSession() as session:
                        async with session.get(
                            download_url,
                            headers={"Authorization": f"Bearer {self.api_key}"},
                        ) as response:
                            response.raise_for_status()
                            with open(zip_path, "wb") as f:
                                f.write(await response.read())
                    # Unpack into the subdirectory
                    with zipfile.ZipFile(zip_path, "r") as zip_ref:
                        zip_ref.extractall(file_extract_dir)
                    # Delete the ZIP file after unpacking
                    zip_path.unlink()
                    # Try to read the Markdown content
                    markdown_content = ""
                    markdown_files = list(file_extract_dir.glob("*.md"))
                    if markdown_files:
                        with open(markdown_files[0], "r", encoding="utf-8") as f:
                            markdown_content = f.read()
                    # Record the successful result
                    results.append(
                        {
                            "filename": file_name,
                            "status": "success",
                            "content": markdown_content,
                            "extract_path": str(file_extract_dir),
                        }
                    )
                    config.logger.debug(
                        f"Result for file {file_name} unpacked to: {file_extract_dir}"
                    )
                except Exception as e:
                    # The download failed; record an error result
                    error_msg = f"Failed to download the result: {str(e)}"
                    config.logger.error(f"File {file_name}: {error_msg}")
                    results.append(
                        {
                            "filename": file_name,
                            "status": "error",
                            "error_message": error_msg,
                        }
                    )
            # Add the files whose processing failed to the results
            for file_name, error_msg in failed_files.items():
                results.append(
                    {
                        "filename": file_name,
                        "status": "error",
                        "error_message": f"Processing failed: {error_msg}",
                    }
                )
            # Log summary statistics
            success_count = len(files_download_urls)
            fail_count = len(failed_files)
            total_count = success_count + fail_count
            config.logger.info("\n=== File processing summary ===")
            config.logger.info(f"Total files: {total_count}")
            config.logger.info(f"Processed successfully: {success_count}")
            config.logger.info(f"Failed: {fail_count}")
            if failed_files:
                config.logger.info("\nFailed file details:")
                for file_name, error_msg in failed_files.items():
                    config.logger.info(f"  - {file_name}: {error_msg}")
            if success_count > 0:
                config.logger.info(f"\nResults saved to: {extract_dir}")
            else:
                config.logger.info(f"\nOutput directory: {extract_dir}")
            # Return the detailed results
            return {
                "results": results,
                "extract_dir": str(extract_dir),
                "success_count": success_count,
                "fail_count": fail_count,
                "total_count": total_count,
            }
        except Exception as e:
            config.logger.error(f"File-to-Markdown conversion failed: {str(e)}")
            raise
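

# Illustrative usage sketch (not part of the original API surface): how a caller might
# drive the client end-to-end. The file path and output directory below are placeholders,
# MINERU_API_KEY must be set, and because this module uses a relative import it has to be
# run as part of its package (e.g. `python -m <package>.api`).
if __name__ == "__main__":

    async def _demo() -> None:
        client = MinerUClient()
        result = await client.process_file_to_markdown(
            client.submit_file_task,  # or client.submit_file_url_task for remote URLs
            "docs/example.pdf",       # placeholder local file
            enable_ocr=True,
            output_dir="./output",    # placeholder output directory
        )
        config.logger.info(
            f"Finished: {result['success_count']} succeeded, "
            f"{result['fail_count']} failed, results in {result['extract_dir']}"
        )

    asyncio.run(_demo())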