pdf_utils.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713
  1. """
  2. PDF处理工具模块
  3. 提供PDF相关处理功能:
  4. - PDF加载与分类
  5. - PDF文本提取
  6. - 跨页表格合并
  7. - 页面范围解析与过滤
  8. """
  9. from typing import Dict, List, Any, Optional, Tuple, Set
  10. from pathlib import Path
  11. from PIL import Image
  12. from loguru import logger
  13. import re
  14. # 导入页面范围解析函数(不依赖 MinerU)
  15. from .file_utils import parse_page_range
  16. # 导入 MinerU 组件
  17. try:
  18. from mineru.utils.pdf_classify import classify as pdf_classify
  19. from mineru.utils.pdf_image_tools import load_images_from_pdf
  20. from mineru.utils.enum_class import ImageType
  21. from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text
  22. MINERU_AVAILABLE = True
  23. except ImportError:
  24. raise ImportError("MinerU components not available for PDF processing")
  25. class PDFUtils:
  26. """PDF处理工具类"""
  27. @staticmethod
  28. def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
  29. """
  30. 解析页面范围字符串(向后兼容包装函数)
  31. 此方法是对 file_utils.parse_page_range 的包装,保持向后兼容性。
  32. 新代码应直接使用 file_utils.parse_page_range。
  33. 支持格式:
  34. - "1-5" → {0, 1, 2, 3, 4}(页码从1开始,内部转为0-based索引)
  35. - "3" → {2}
  36. - "1-5,7,9-12" → {0, 1, 2, 3, 4, 6, 8, 9, 10, 11}
  37. - "1-" → 从第1页到最后
  38. - "-5" → 从第1页到第5页
  39. Args:
  40. page_range: 页面范围字符串(页码从1开始)
  41. total_pages: 总页数
  42. Returns:
  43. 页面索引集合(0-based)
  44. """
  45. return parse_page_range(page_range, total_pages)
  46. @staticmethod
  47. def _detect_pdf_doc_type(pdf_doc: Any) -> str:
  48. """
  49. 检测 PDF 文档对象类型
  50. Args:
  51. pdf_doc: PDF 文档对象
  52. Returns:
  53. 'pypdfium2' 或 'fitz'
  54. """
  55. doc_type_name = type(pdf_doc).__name__
  56. doc_module = type(pdf_doc).__module__
  57. if 'pdfium' in doc_module.lower() or 'PdfDocument' in doc_type_name:
  58. return 'pypdfium2'
  59. elif 'fitz' in doc_module.lower() or 'Document' in doc_type_name:
  60. return 'fitz'
  61. else:
  62. # 尝试通过属性判断
  63. if hasattr(pdf_doc, 'get_page') or hasattr(pdf_doc, 'page_count'):
  64. # fitz.Document 有 page_count 属性
  65. return 'fitz'
  66. else:
  67. # pypdfium2 通过索引访问
  68. return 'pypdfium2'
  69. @staticmethod
  70. def load_and_classify_document(
  71. document_path: Path,
  72. dpi: int = 200,
  73. page_range: Optional[str] = None,
  74. renderer: str = "fitz" # 新增参数,默认 fitz
  75. ) -> Tuple[List[Dict], str, Optional[Any], str]:
  76. """
  77. 加载文档并分类,支持页面范围过滤
  78. Args:
  79. document_path: 文档路径
  80. dpi: PDF渲染DPI
  81. page_range: 页面范围字符串,如 "1-5,7,9-12"
  82. - PDF:按页码(从1开始)
  83. - 图片目录:按文件名排序后的位置(从1开始)
  84. renderer: PDF渲染引擎,"fitz" 或 "pypdfium2"
  85. Returns:
  86. (images_list, pdf_type, pdf_doc)
  87. - images_list: 图像列表,每个元素包含 {'img_pil': PIL.Image, 'scale': float, 'page_idx': int}
  88. - pdf_type: 'ocr' 或 'txt'
  89. - pdf_doc: PDF文档对象(如果PDF)
  90. - renderer_used: 实际使用的渲染器类型
  91. """
  92. pdf_doc = None
  93. pdf_type = 'ocr' # 默认使用OCR模式
  94. all_images = []
  95. if document_path.is_dir():
  96. # 处理目录:遍历所有图片
  97. image_extensions = {'.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif'}
  98. image_files = sorted([
  99. f for f in document_path.iterdir()
  100. if f.suffix.lower() in image_extensions
  101. ])
  102. # 解析页面范围
  103. total_pages = len(image_files)
  104. selected_pages = parse_page_range(page_range, total_pages)
  105. if page_range:
  106. logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(selected_pages)} 张")
  107. for idx, img_file in enumerate(image_files):
  108. if idx not in selected_pages:
  109. continue
  110. img = Image.open(img_file)
  111. if img.mode != 'RGB':
  112. img = img.convert('RGB')
  113. all_images.append({
  114. 'img_pil': img,
  115. 'scale': 1.0,
  116. 'source_path': str(img_file),
  117. 'page_idx': idx, # 原始索引
  118. 'page_name': img_file.stem # 文件名(不含扩展名)
  119. })
  120. pdf_type = 'ocr' # 图片目录始终使用OCR模式
  121. elif document_path.suffix.lower() == '.pdf':
  122. # 处理PDF文件
  123. if not MINERU_AVAILABLE:
  124. raise RuntimeError("MinerU components not available for PDF processing")
  125. with open(document_path, 'rb') as f:
  126. pdf_bytes = f.read()
  127. # PDF分类
  128. pdf_type = pdf_classify(pdf_bytes)
  129. logger.info(f"📋 PDF classified as: {pdf_type}")
  130. # 加载图像
  131. images_list, pdf_doc = load_images_from_pdf_unified(
  132. pdf_bytes,
  133. dpi=dpi,
  134. image_type=ImageType.PIL,
  135. renderer=renderer # 使用指定的渲染引擎
  136. )
  137. # 解析页面范围
  138. total_pages = len(images_list)
  139. selected_pages = parse_page_range(page_range, total_pages)
  140. if page_range:
  141. logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(selected_pages)} 页")
  142. for idx, img_dict in enumerate(images_list):
  143. if idx not in selected_pages:
  144. continue
  145. all_images.append({
  146. 'img_pil': img_dict['img_pil'],
  147. 'scale': img_dict.get('scale', dpi / 72),
  148. 'source_path': str(document_path),
  149. 'page_idx': idx, # 原始页码索引
  150. 'page_name': f"{document_path.stem}_page_{idx + 1:03d}"
  151. })
  152. elif document_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif']:
  153. # 处理单个图片
  154. img = Image.open(document_path)
  155. if img.mode != 'RGB':
  156. img = img.convert('RGB')
  157. all_images.append({
  158. 'img_pil': img,
  159. 'scale': 1.0,
  160. 'source_path': str(document_path),
  161. 'page_idx': 0,
  162. 'page_name': document_path.stem
  163. })
  164. pdf_type = 'ocr'
  165. else:
  166. raise ValueError(f"Unsupported file format: {document_path.suffix}")
  167. return all_images, pdf_type, pdf_doc, renderer
  168. @staticmethod
  169. def extract_text_from_pdf(
  170. pdf_doc: Any,
  171. page_idx: int,
  172. bbox: List[float],
  173. scale: float
  174. ) -> Tuple[str, bool]:
  175. """
  176. 从PDF直接提取文本(支持 pypdfium2 和 fitz)
  177. Args:
  178. pdf_doc: PDF文档对象 (pypdfium2.PdfDocument 或 fitz.Document)
  179. page_idx: 页码索引
  180. bbox: 目标区域的bbox(图像坐标)
  181. scale: 图像与PDF的缩放比例
  182. Returns:
  183. (text, success)
  184. """
  185. # 检测 PDF 文档类型
  186. doc_type = PDFUtils._detect_pdf_doc_type(pdf_doc)
  187. if doc_type == 'fitz':
  188. return PDFUtils._extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale)
  189. else: # pypdfium2
  190. return PDFUtils._extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale)
  191. @staticmethod
  192. def _extract_text_from_pdf_pypdfium2(
  193. pdf_doc: Any,
  194. page_idx: int,
  195. bbox: List[float],
  196. scale: float
  197. ) -> Tuple[str, bool]:
  198. """使用 pypdfium2 提取文本(原有实现)"""
  199. if not MINERU_AVAILABLE or pdf_get_page_text is None:
  200. logger.error("MinerU pdf_text_tool not available")
  201. return "", False
  202. try:
  203. page = pdf_doc[page_idx]
  204. # 将图像坐标转换为PDF坐标
  205. pdf_bbox = [
  206. bbox[0] / scale,
  207. bbox[1] / scale,
  208. bbox[2] / scale,
  209. bbox[3] / scale
  210. ]
  211. # 使用 MinerU 的方式获取页面文本信息
  212. page_dict = pdf_get_page_text(page)
  213. # 从 blocks 中提取与 bbox 重叠的文本
  214. text_parts = []
  215. for block in page_dict.get('blocks', []):
  216. for line in block.get('lines', []):
  217. line_bbox = line.get('bbox')
  218. if line_bbox and hasattr(line_bbox, 'bbox'):
  219. line_bbox = line_bbox.bbox
  220. elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
  221. line_bbox = list(line_bbox)
  222. else:
  223. continue
  224. if PDFUtils._bbox_overlap(pdf_bbox, line_bbox):
  225. for span in line.get('spans', []):
  226. span_text = span.get('text', '')
  227. if span_text:
  228. text_parts.append(span_text)
  229. text = ' '.join(text_parts)
  230. return text.strip(), bool(text.strip())
  231. except Exception as e:
  232. import traceback
  233. logger.debug(f"pypdfium2 text extraction error: {e}")
  234. logger.debug(traceback.format_exc())
  235. return "", False
  236. @staticmethod
  237. def _extract_text_from_pdf_fitz(
  238. pdf_doc: Any,
  239. page_idx: int,
  240. bbox: List[float],
  241. scale: float
  242. ) -> Tuple[str, bool]:
  243. """使用 fitz 提取文本"""
  244. try:
  245. import fitz
  246. except ImportError:
  247. logger.error("PyMuPDF (fitz) not available")
  248. return "", False
  249. try:
  250. page = pdf_doc[page_idx]
  251. # 将图像坐标转换为PDF坐标
  252. pdf_bbox = fitz.Rect(
  253. bbox[0] / scale,
  254. bbox[1] / scale,
  255. bbox[2] / scale,
  256. bbox[3] / scale
  257. )
  258. # 提取区域内的文本
  259. text = page.get_text("text", clip=pdf_bbox)
  260. return text.strip(), bool(text.strip())
  261. except Exception as e:
  262. import traceback
  263. logger.debug(f"fitz text extraction error: {e}")
  264. logger.debug(traceback.format_exc())
  265. return "", False
  266. @staticmethod
  267. def extract_all_text_blocks(
  268. pdf_doc: Any,
  269. page_idx: int,
  270. scale: float
  271. ) -> List[Dict[str, Any]]:
  272. """
  273. 提取页面所有文本块(支持 pypdfium2 和 fitz)
  274. Args:
  275. pdf_doc: PDF文档对象
  276. page_idx: 页码
  277. scale: 缩放比例
  278. Returns:
  279. 文本块列表 [{'text': str, 'bbox': [x1, y1, x2, y2]}, ...]
  280. """
  281. # 检测 PDF 文档类型
  282. doc_type = PDFUtils._detect_pdf_doc_type(pdf_doc)
  283. if doc_type == 'fitz':
  284. return PDFUtils._extract_all_text_blocks_fitz(pdf_doc, page_idx, scale)
  285. else: # pypdfium2
  286. return PDFUtils._extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale)
  287. @staticmethod
  288. def _extract_all_text_blocks_pypdfium2(
  289. pdf_doc: Any,
  290. page_idx: int,
  291. scale: float
  292. ) -> List[Dict[str, Any]]:
  293. """使用 pypdfium2 提取所有文本块(原有实现)"""
  294. if not MINERU_AVAILABLE or pdf_get_page_text is None:
  295. return []
  296. try:
  297. page = pdf_doc[page_idx]
  298. page_dict = pdf_get_page_text(page)
  299. extracted_blocks = []
  300. for block in page_dict.get('blocks', []):
  301. for line in block.get('lines', []):
  302. line_text = ""
  303. for span in line.get('spans', []):
  304. line_text += span.get('text', "")
  305. if not line_text.strip():
  306. continue
  307. line_bbox = line.get('bbox')
  308. if line_bbox and hasattr(line_bbox, 'bbox'):
  309. line_bbox = line_bbox.bbox
  310. elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
  311. line_bbox = list(line_bbox)
  312. else:
  313. continue
  314. img_bbox = [
  315. line_bbox[0] * scale,
  316. line_bbox[1] * scale,
  317. line_bbox[2] * scale,
  318. line_bbox[3] * scale
  319. ]
  320. extracted_blocks.append({
  321. 'text': line_text,
  322. 'bbox': img_bbox,
  323. 'origin_bbox': line_bbox
  324. })
  325. return extracted_blocks
  326. except Exception as e:
  327. logger.warning(f"pypdfium2 extract_all_text_blocks failed: {e}")
  328. import traceback
  329. logger.debug(traceback.format_exc())
  330. return []
  331. @staticmethod
  332. def _extract_all_text_blocks_fitz(
  333. pdf_doc: Any,
  334. page_idx: int,
  335. scale: float
  336. ) -> List[Dict[str, Any]]:
  337. """使用 fitz 提取所有文本块"""
  338. try:
  339. import fitz
  340. except ImportError:
  341. logger.warning("PyMuPDF (fitz) not available")
  342. return []
  343. try:
  344. page = pdf_doc[page_idx]
  345. # 使用 get_text("dict") 获取详细的文本信息
  346. text_dict = page.get_text("dict")
  347. extracted_blocks = []
  348. # 遍历所有 blocks
  349. for block in text_dict.get("blocks", []):
  350. # 只处理文本块(type=0)
  351. if block.get("type") != 0:
  352. continue
  353. # 遍历所有 lines
  354. for line in block.get("lines", []):
  355. line_text = ""
  356. line_bbox = line.get("bbox")
  357. # 提取 line 中的所有 span 文本
  358. for span in line.get("spans", []):
  359. line_text += span.get("text", "")
  360. if not line_text.strip() or not line_bbox:
  361. continue
  362. # PDF 坐标转换为图像坐标
  363. img_bbox = [
  364. line_bbox[0] * scale,
  365. line_bbox[1] * scale,
  366. line_bbox[2] * scale,
  367. line_bbox[3] * scale
  368. ]
  369. extracted_blocks.append({
  370. 'text': line_text,
  371. 'bbox': img_bbox,
  372. 'origin_bbox': list(line_bbox)
  373. })
  374. return extracted_blocks
  375. except Exception as e:
  376. logger.warning(f"fitz extract_all_text_blocks failed: {e}")
  377. import traceback
  378. logger.debug(traceback.format_exc())
  379. return []
  380. @staticmethod
  381. def _bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
  382. """检查两个 bbox 是否重叠"""
  383. if len(bbox1) < 4 or len(bbox2) < 4:
  384. return False
  385. x1_1, y1_1, x2_1, y2_1 = bbox1[:4]
  386. x1_2, y1_2, x2_2, y2_2 = bbox2[:4]
  387. if x2_1 < x1_2 or x2_2 < x1_1:
  388. return False
  389. if y2_1 < y1_2 or y2_2 < y1_1:
  390. return False
  391. return True
  392. @staticmethod
  393. def merge_cross_page_tables(results: Dict[str, Any]) -> Dict[str, Any]:
  394. """
  395. 合并跨页表格
  396. TODO: 实现跨页表格合并逻辑
  397. 可以参考 MinerU 的 cross_page_table_merge 实现
  398. Args:
  399. results: 处理结果字典
  400. Returns:
  401. 合并后的结果
  402. """
  403. # TODO: 实现跨页表格合并逻辑
  404. return results
  405. # ============================================================================
  406. # 统一的 PDF 图像加载函数 - 支持多种渲染引擎
  407. # ============================================================================
  408. def load_images_from_pdf_unified(
  409. pdf_bytes: bytes,
  410. dpi: int = 200,
  411. start_page_id: int = 0,
  412. end_page_id: Optional[int] = None,
  413. image_type: str = "PIL",
  414. renderer: str = "pypdfium2",
  415. timeout: Optional[int] = None,
  416. threads: int = 4,
  417. ) -> Tuple[List[Dict[str, Any]], Any]:
  418. """
  419. 从 PDF 加载图像,支持两种渲染引擎
  420. Args:
  421. pdf_bytes: PDF 文件的字节数据
  422. dpi: 渲染 DPI,默认 200
  423. start_page_id: 起始页码(0-based),默认 0
  424. end_page_id: 结束页码(0-based,包含),默认 None(处理到最后)
  425. image_type: 返回图像类型,"PIL" 或 "BASE64"
  426. renderer: 渲染引擎选择
  427. - "pypdfium2": 使用 MinerU 标准的 pypdfium2(推荐)
  428. * 优势: Chrome PDFium 引擎,多进程加速,更好的细节保留
  429. * 尺寸限制: 3500px,超过则动态调整 scale
  430. - "fitz" / "pymupdf": 使用 PyMuPDF (fitz)
  431. * 优势: MuPDF 引擎,简单直接,无需额外依赖
  432. * 尺寸限制: 4500px,超过则降到 72 DPI
  433. timeout: 超时时间(秒),仅 pypdfium2 支持
  434. threads: 进程数,仅 pypdfium2 支持多进程加速(Windows 下自动禁用)
  435. Returns:
  436. (images_list, pdf_doc)
  437. - images_list: 图像列表,每个元素为 {'img_pil': PIL.Image, 'scale': float}
  438. 或 {'img_base64': str, 'scale': float}(取决于 image_type)
  439. - pdf_doc: PDF 文档对象(pypdfium2.PdfDocument 或 fitz.Document)
  440. Raises:
  441. ImportError: 如果选择的渲染引擎不可用
  442. ValueError: 如果参数无效
  443. TimeoutError: 如果转换超时(仅 pypdfium2)
  444. 渲染引擎对比:
  445. ┌─────────────┬──────────────┬──────────────┐
  446. │ 特性 │ pypdfium2 │ fitz │
  447. ├─────────────┼──────────────┼──────────────┤
  448. │ 渲染引擎 │ Chrome PDFium│ MuPDF │
  449. │ 多进程加速 │ ✅ (非Windows)│ ❌ │
  450. │ 超时控制 │ ✅ │ ❌ │
  451. │ 尺寸限制 │ 3500px │ 4500px │
  452. │ 超限处理 │ 动态调整scale│ 降到72 DPI │
  453. │ 细节保留 │ 更好 │ 良好 │
  454. │ MinerU标准 │ ✅ │ ❌ │
  455. └─────────────┴──────────────┴──────────────┘
  456. 示例:
  457. # 使用 pypdfium2(推荐,MinerU 标准)
  458. images, doc = load_images_from_pdf_unified(
  459. pdf_bytes,
  460. dpi=200,
  461. renderer="pypdfium2",
  462. threads=4
  463. )
  464. # 使用 PyMuPDF (fitz)
  465. images, doc = load_images_from_pdf_unified(
  466. pdf_bytes,
  467. dpi=200,
  468. renderer="fitz"
  469. )
  470. # 访问图像
  471. for img_dict in images:
  472. pil_image = img_dict['img_pil']
  473. scale = img_dict['scale']
  474. # 处理图像...
  475. 注意事项:
  476. 1. pypdfium2 在生产环境中更推荐,因为它是 MinerU 的标准实现
  477. 2. 两种渲染引擎可能产生略有不同的图像(SSIM ≈ 0.945)
  478. 3. 建议在同一项目中保持使用同一渲染引擎,避免不一致
  479. 4. 如果需要与现有测试图像对比,使用相同的渲染引擎
  480. """
  481. renderer = renderer.lower()
  482. if renderer in ["pypdfium2", "pdfium"]:
  483. return _load_images_pypdfium2(
  484. pdf_bytes, dpi, start_page_id, end_page_id,
  485. image_type, timeout, threads
  486. )
  487. elif renderer in ["fitz", "pymupdf", "mupdf"]:
  488. return _load_images_fitz(
  489. pdf_bytes, dpi, start_page_id, end_page_id, image_type
  490. )
  491. else:
  492. raise ValueError(
  493. f"不支持的渲染引擎: {renderer}. "
  494. f"请使用 'pypdfium2' 或 'fitz'"
  495. )
  496. def _load_images_pypdfium2(
  497. pdf_bytes: bytes,
  498. dpi: int,
  499. start_page_id: int,
  500. end_page_id: Optional[int],
  501. image_type: str,
  502. timeout: Optional[int],
  503. threads: int
  504. ) -> Tuple[List[Dict[str, Any]], Any]:
  505. """使用 pypdfium2 渲染引擎(MinerU 标准)"""
  506. try:
  507. import pypdfium2 as pdfium
  508. from mineru.utils.pdf_image_tools import load_images_from_pdf as mineru_load_images
  509. from mineru.utils.enum_class import ImageType
  510. except ImportError as e:
  511. raise ImportError(
  512. f"pypdfium2 渲染引擎需要安装 MinerU: pip install mineru\n"
  513. f"原始错误: {e}"
  514. )
  515. # 转换 image_type
  516. img_type = ImageType.PIL if image_type.upper() == "PIL" else ImageType.BASE64
  517. # 使用 MinerU 的实现
  518. images_list, pdf_doc = mineru_load_images(
  519. pdf_bytes=pdf_bytes,
  520. dpi=dpi,
  521. start_page_id=start_page_id,
  522. end_page_id=end_page_id,
  523. image_type=img_type,
  524. timeout=timeout,
  525. threads=threads
  526. )
  527. logger.info(
  528. f"✅ pypdfium2 渲染完成: {len(images_list)} 页 "
  529. f"(DPI={dpi}, 多进程={threads})"
  530. )
  531. return images_list, pdf_doc
  532. def _load_images_fitz(
  533. pdf_bytes: bytes,
  534. dpi: int,
  535. start_page_id: int,
  536. end_page_id: Optional[int],
  537. image_type: str
  538. ) -> Tuple[List[Dict[str, Any]], Any]:
  539. """使用 PyMuPDF (fitz) 渲染引擎"""
  540. try:
  541. import fitz
  542. except ImportError as e:
  543. raise ImportError(
  544. f"PyMuPDF 渲染引擎需要安装: pip install PyMuPDF\n"
  545. f"原始错误: {e}"
  546. )
  547. from io import BytesIO
  548. import base64
  549. # 打开 PDF
  550. doc = fitz.open(stream=pdf_bytes, filetype="pdf")
  551. pdf_page_num = doc.page_count
  552. # 处理 end_page_id
  553. if end_page_id is None or end_page_id < 0:
  554. end_page_id = pdf_page_num - 1
  555. end_page_id = min(end_page_id, pdf_page_num - 1)
  556. # 渲染图像
  557. images_list = []
  558. mat = fitz.Matrix(dpi / 72, dpi / 72)
  559. for index in range(start_page_id, end_page_id + 1):
  560. page = doc[index]
  561. # 渲染为 pixmap
  562. pm = page.get_pixmap(matrix=mat, alpha=False)
  563. # 如果超过尺寸限制,降低到 72 DPI
  564. if pm.width > 4500 or pm.height > 4500:
  565. logger.warning(
  566. f"⚠️ 页面 {index} 尺寸过大 ({pm.width}x{pm.height}), "
  567. f"降低到 72 DPI"
  568. )
  569. mat_fallback = fitz.Matrix(1, 1) # 72 DPI
  570. pm = page.get_pixmap(matrix=mat_fallback, alpha=False)
  571. # 转换为 PIL Image
  572. pil_img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
  573. # 计算实际 scale
  574. page_rect = page.rect
  575. actual_scale = pm.width / page_rect.width
  576. # 构建返回字典
  577. image_dict = {
  578. 'img_pil': pil_img,
  579. 'scale': actual_scale
  580. }
  581. # 如果需要 BASE64
  582. if image_type.upper() == "BASE64":
  583. buffer = BytesIO()
  584. pil_img.save(buffer, format="JPEG")
  585. img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
  586. image_dict['img_base64'] = img_base64
  587. # 移除 img_pil 以节省内存
  588. del image_dict['img_pil']
  589. images_list.append(image_dict)
  590. logger.info(
  591. f"✅ PyMuPDF (fitz) 渲染完成: {len(images_list)} 页 "
  592. f"(DPI={dpi}, 单进程)"
  593. )
  594. return images_list, doc