pdf_utils.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492
  1. """
  2. PDF处理工具模块
  3. 提供PDF相关处理功能:
  4. - PDF加载与分类
  5. - PDF文本提取
  6. - 跨页表格合并
  7. - 页面范围解析与过滤
  8. """
  9. from typing import Dict, List, Any, Optional, Tuple, Set
  10. from pathlib import Path
  11. from PIL import Image
  12. from loguru import logger
  13. import re
  14. # 导入页面范围解析函数(不依赖 MinerU)
  15. from .file_utils import parse_page_range
  16. # 导入 MinerU 组件
  17. try:
  18. from mineru.utils.pdf_classify import classify as pdf_classify
  19. from mineru.utils.pdf_image_tools import load_images_from_pdf
  20. from mineru.utils.enum_class import ImageType
  21. from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text
  22. MINERU_AVAILABLE = True
  23. except ImportError:
  24. raise ImportError("MinerU components not available for PDF processing")
  25. class PDFUtils:
  26. """PDF处理工具类"""
  27. @staticmethod
  28. def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
  29. """
  30. 解析页面范围字符串(向后兼容包装函数)
  31. 此方法是对 file_utils.parse_page_range 的包装,保持向后兼容性。
  32. 新代码应直接使用 file_utils.parse_page_range。
  33. 支持格式:
  34. - "1-5" → {0, 1, 2, 3, 4}(页码从1开始,内部转为0-based索引)
  35. - "3" → {2}
  36. - "1-5,7,9-12" → {0, 1, 2, 3, 4, 6, 8, 9, 10, 11}
  37. - "1-" → 从第1页到最后
  38. - "-5" → 从第1页到第5页
  39. Args:
  40. page_range: 页面范围字符串(页码从1开始)
  41. total_pages: 总页数
  42. Returns:
  43. 页面索引集合(0-based)
  44. """
  45. return parse_page_range(page_range, total_pages)
  46. @staticmethod
  47. def load_and_classify_document(
  48. document_path: Path,
  49. dpi: int = 200,
  50. page_range: Optional[str] = None
  51. ) -> Tuple[List[Dict], str, Optional[Any]]:
  52. """
  53. 加载文档并分类,支持页面范围过滤
  54. Args:
  55. document_path: 文档路径
  56. dpi: PDF渲染DPI
  57. page_range: 页面范围字符串,如 "1-5,7,9-12"
  58. - PDF:按页码(从1开始)
  59. - 图片目录:按文件名排序后的位置(从1开始)
  60. Returns:
  61. (images_list, pdf_type, pdf_doc)
  62. - images_list: 图像列表,每个元素包含 {'img_pil': PIL.Image, 'scale': float, 'page_idx': int}
  63. - pdf_type: 'ocr' 或 'txt'
  64. - pdf_doc: PDF文档对象(如果是PDF)
  65. """
  66. pdf_doc = None
  67. pdf_type = 'ocr' # 默认使用OCR模式
  68. all_images = []
  69. if document_path.is_dir():
  70. # 处理目录:遍历所有图片
  71. image_extensions = {'.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif'}
  72. image_files = sorted([
  73. f for f in document_path.iterdir()
  74. if f.suffix.lower() in image_extensions
  75. ])
  76. # 解析页面范围
  77. total_pages = len(image_files)
  78. selected_pages = parse_page_range(page_range, total_pages)
  79. if page_range:
  80. logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(selected_pages)} 张")
  81. for idx, img_file in enumerate(image_files):
  82. if idx not in selected_pages:
  83. continue
  84. img = Image.open(img_file)
  85. if img.mode != 'RGB':
  86. img = img.convert('RGB')
  87. all_images.append({
  88. 'img_pil': img,
  89. 'scale': 1.0,
  90. 'source_path': str(img_file),
  91. 'page_idx': idx, # 原始索引
  92. 'page_name': img_file.stem # 文件名(不含扩展名)
  93. })
  94. pdf_type = 'ocr' # 图片目录始终使用OCR模式
  95. elif document_path.suffix.lower() == '.pdf':
  96. # 处理PDF文件
  97. if not MINERU_AVAILABLE:
  98. raise RuntimeError("MinerU components not available for PDF processing")
  99. with open(document_path, 'rb') as f:
  100. pdf_bytes = f.read()
  101. # PDF分类
  102. pdf_type = pdf_classify(pdf_bytes)
  103. logger.info(f"📋 PDF classified as: {pdf_type}")
  104. # 加载图像
  105. images_list, pdf_doc = load_images_from_pdf_unified(
  106. pdf_bytes,
  107. dpi=dpi,
  108. image_type=ImageType.PIL,
  109. renderer='fitz'
  110. )
  111. # 解析页面范围
  112. total_pages = len(images_list)
  113. selected_pages = parse_page_range(page_range, total_pages)
  114. if page_range:
  115. logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(selected_pages)} 页")
  116. for idx, img_dict in enumerate(images_list):
  117. if idx not in selected_pages:
  118. continue
  119. all_images.append({
  120. 'img_pil': img_dict['img_pil'],
  121. 'scale': img_dict.get('scale', dpi / 72),
  122. 'source_path': str(document_path),
  123. 'page_idx': idx, # 原始页码索引
  124. 'page_name': f"{document_path.stem}_page_{idx + 1:03d}"
  125. })
  126. elif document_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif']:
  127. # 处理单个图片
  128. img = Image.open(document_path)
  129. if img.mode != 'RGB':
  130. img = img.convert('RGB')
  131. all_images.append({
  132. 'img_pil': img,
  133. 'scale': 1.0,
  134. 'source_path': str(document_path),
  135. 'page_idx': 0,
  136. 'page_name': document_path.stem
  137. })
  138. pdf_type = 'ocr'
  139. else:
  140. raise ValueError(f"Unsupported file format: {document_path.suffix}")
  141. return all_images, pdf_type, pdf_doc
  142. @staticmethod
  143. def extract_text_from_pdf(
  144. pdf_doc: Any,
  145. page_idx: int,
  146. bbox: List[float],
  147. scale: float
  148. ) -> Tuple[str, bool]:
  149. """
  150. 从PDF直接提取文本(使用 MinerU 的 pypdfium2 方式)
  151. Args:
  152. pdf_doc: pypdfium2 的 PdfDocument 对象
  153. page_idx: 页码索引
  154. bbox: 目标区域的bbox(图像坐标)
  155. scale: 图像与PDF的缩放比例
  156. Returns:
  157. (text, success)
  158. """
  159. if not MINERU_AVAILABLE or pdf_get_page_text is None:
  160. logger.debug("MinerU pdf_text_tool not available")
  161. return "", False
  162. try:
  163. page = pdf_doc[page_idx]
  164. # 将图像坐标转换为PDF坐标
  165. pdf_bbox = [
  166. bbox[0] / scale,
  167. bbox[1] / scale,
  168. bbox[2] / scale,
  169. bbox[3] / scale
  170. ]
  171. # 使用 MinerU 的方式获取页面文本信息
  172. page_dict = pdf_get_page_text(page)
  173. # 从 blocks 中提取与 bbox 重叠的文本
  174. text_parts = []
  175. for block in page_dict.get('blocks', []):
  176. for line in block.get('lines', []):
  177. line_bbox = line.get('bbox')
  178. if line_bbox and hasattr(line_bbox, 'bbox'):
  179. line_bbox = line_bbox.bbox # pdftext 的 BBox 对象
  180. elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
  181. line_bbox = list(line_bbox)
  182. else:
  183. continue
  184. # 检查 line 是否与目标 bbox 重叠
  185. if PDFUtils._bbox_overlap(pdf_bbox, line_bbox):
  186. for span in line.get('spans', []):
  187. span_text = span.get('text', '')
  188. if span_text:
  189. text_parts.append(span_text)
  190. text = ' '.join(text_parts)
  191. return text.strip(), bool(text.strip())
  192. except Exception as e:
  193. import traceback
  194. logger.debug(f"PDF text extraction error: {e}")
  195. logger.debug(traceback.format_exc())
  196. return "", False
  197. @staticmethod
  198. def _bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
  199. """检查两个 bbox 是否重叠"""
  200. if len(bbox1) < 4 or len(bbox2) < 4:
  201. return False
  202. x1_1, y1_1, x2_1, y2_1 = bbox1[:4]
  203. x1_2, y1_2, x2_2, y2_2 = bbox2[:4]
  204. if x2_1 < x1_2 or x2_2 < x1_1:
  205. return False
  206. if y2_1 < y1_2 or y2_2 < y1_1:
  207. return False
  208. return True
  209. @staticmethod
  210. def merge_cross_page_tables(results: Dict[str, Any]) -> Dict[str, Any]:
  211. """
  212. 合并跨页表格
  213. TODO: 实现跨页表格合并逻辑
  214. 可以参考 MinerU 的 cross_page_table_merge 实现
  215. Args:
  216. results: 处理结果字典
  217. Returns:
  218. 合并后的结果
  219. """
  220. # TODO: 实现跨页表格合并逻辑
  221. return results
  222. # ============================================================================
  223. # 统一的 PDF 图像加载函数 - 支持多种渲染引擎
  224. # ============================================================================
  225. def load_images_from_pdf_unified(
  226. pdf_bytes: bytes,
  227. dpi: int = 200,
  228. start_page_id: int = 0,
  229. end_page_id: Optional[int] = None,
  230. image_type: str = "PIL",
  231. renderer: str = "pypdfium2",
  232. timeout: Optional[int] = None,
  233. threads: int = 4,
  234. ) -> Tuple[List[Dict[str, Any]], Any]:
  235. """
  236. 从 PDF 加载图像,支持两种渲染引擎
  237. Args:
  238. pdf_bytes: PDF 文件的字节数据
  239. dpi: 渲染 DPI,默认 200
  240. start_page_id: 起始页码(0-based),默认 0
  241. end_page_id: 结束页码(0-based,包含),默认 None(处理到最后)
  242. image_type: 返回图像类型,"PIL" 或 "BASE64"
  243. renderer: 渲染引擎选择
  244. - "pypdfium2": 使用 MinerU 标准的 pypdfium2(推荐)
  245. * 优势: Chrome PDFium 引擎,多进程加速,更好的细节保留
  246. * 尺寸限制: 3500px,超过则动态调整 scale
  247. - "fitz" / "pymupdf": 使用 PyMuPDF (fitz)
  248. * 优势: MuPDF 引擎,简单直接,无需额外依赖
  249. * 尺寸限制: 4500px,超过则降到 72 DPI
  250. timeout: 超时时间(秒),仅 pypdfium2 支持
  251. threads: 进程数,仅 pypdfium2 支持多进程加速(Windows 下自动禁用)
  252. Returns:
  253. (images_list, pdf_doc)
  254. - images_list: 图像列表,每个元素为 {'img_pil': PIL.Image, 'scale': float}
  255. 或 {'img_base64': str, 'scale': float}(取决于 image_type)
  256. - pdf_doc: PDF 文档对象(pypdfium2.PdfDocument 或 fitz.Document)
  257. Raises:
  258. ImportError: 如果选择的渲染引擎不可用
  259. ValueError: 如果参数无效
  260. TimeoutError: 如果转换超时(仅 pypdfium2)
  261. 渲染引擎对比:
  262. ┌─────────────┬──────────────┬──────────────┐
  263. │ 特性 │ pypdfium2 │ fitz │
  264. ├─────────────┼──────────────┼──────────────┤
  265. │ 渲染引擎 │ Chrome PDFium│ MuPDF │
  266. │ 多进程加速 │ ✅ (非Windows)│ ❌ │
  267. │ 超时控制 │ ✅ │ ❌ │
  268. │ 尺寸限制 │ 3500px │ 4500px │
  269. │ 超限处理 │ 动态调整scale│ 降到72 DPI │
  270. │ 细节保留 │ 更好 │ 良好 │
  271. │ MinerU标准 │ ✅ │ ❌ │
  272. └─────────────┴──────────────┴──────────────┘
  273. 示例:
  274. # 使用 pypdfium2(推荐,MinerU 标准)
  275. images, doc = load_images_from_pdf_unified(
  276. pdf_bytes,
  277. dpi=200,
  278. renderer="pypdfium2",
  279. threads=4
  280. )
  281. # 使用 PyMuPDF (fitz)
  282. images, doc = load_images_from_pdf_unified(
  283. pdf_bytes,
  284. dpi=200,
  285. renderer="fitz"
  286. )
  287. # 访问图像
  288. for img_dict in images:
  289. pil_image = img_dict['img_pil']
  290. scale = img_dict['scale']
  291. # 处理图像...
  292. 注意事项:
  293. 1. pypdfium2 在生产环境中更推荐,因为它是 MinerU 的标准实现
  294. 2. 两种渲染引擎可能产生略有不同的图像(SSIM ≈ 0.945)
  295. 3. 建议在同一项目中保持使用同一渲染引擎,避免不一致
  296. 4. 如果需要与现有测试图像对比,使用相同的渲染引擎
  297. """
  298. renderer = renderer.lower()
  299. if renderer in ["pypdfium2", "pdfium"]:
  300. return _load_images_pypdfium2(
  301. pdf_bytes, dpi, start_page_id, end_page_id,
  302. image_type, timeout, threads
  303. )
  304. elif renderer in ["fitz", "pymupdf", "mupdf"]:
  305. return _load_images_fitz(
  306. pdf_bytes, dpi, start_page_id, end_page_id, image_type
  307. )
  308. else:
  309. raise ValueError(
  310. f"不支持的渲染引擎: {renderer}. "
  311. f"请使用 'pypdfium2' 或 'fitz'"
  312. )
  313. def _load_images_pypdfium2(
  314. pdf_bytes: bytes,
  315. dpi: int,
  316. start_page_id: int,
  317. end_page_id: Optional[int],
  318. image_type: str,
  319. timeout: Optional[int],
  320. threads: int
  321. ) -> Tuple[List[Dict[str, Any]], Any]:
  322. """使用 pypdfium2 渲染引擎(MinerU 标准)"""
  323. try:
  324. import pypdfium2 as pdfium
  325. from mineru.utils.pdf_image_tools import load_images_from_pdf as mineru_load_images
  326. from mineru.utils.enum_class import ImageType
  327. except ImportError as e:
  328. raise ImportError(
  329. f"pypdfium2 渲染引擎需要安装 MinerU: pip install mineru\n"
  330. f"原始错误: {e}"
  331. )
  332. # 转换 image_type
  333. img_type = ImageType.PIL if image_type.upper() == "PIL" else ImageType.BASE64
  334. # 使用 MinerU 的实现
  335. images_list, pdf_doc = mineru_load_images(
  336. pdf_bytes=pdf_bytes,
  337. dpi=dpi,
  338. start_page_id=start_page_id,
  339. end_page_id=end_page_id,
  340. image_type=img_type,
  341. timeout=timeout,
  342. threads=threads
  343. )
  344. logger.info(
  345. f"✅ pypdfium2 渲染完成: {len(images_list)} 页 "
  346. f"(DPI={dpi}, 多进程={threads})"
  347. )
  348. return images_list, pdf_doc
  349. def _load_images_fitz(
  350. pdf_bytes: bytes,
  351. dpi: int,
  352. start_page_id: int,
  353. end_page_id: Optional[int],
  354. image_type: str
  355. ) -> Tuple[List[Dict[str, Any]], Any]:
  356. """使用 PyMuPDF (fitz) 渲染引擎"""
  357. try:
  358. import fitz
  359. except ImportError as e:
  360. raise ImportError(
  361. f"PyMuPDF 渲染引擎需要安装: pip install PyMuPDF\n"
  362. f"原始错误: {e}"
  363. )
  364. from io import BytesIO
  365. import base64
  366. # 打开 PDF
  367. doc = fitz.open(stream=pdf_bytes, filetype="pdf")
  368. pdf_page_num = doc.page_count
  369. # 处理 end_page_id
  370. if end_page_id is None or end_page_id < 0:
  371. end_page_id = pdf_page_num - 1
  372. end_page_id = min(end_page_id, pdf_page_num - 1)
  373. # 渲染图像
  374. images_list = []
  375. mat = fitz.Matrix(dpi / 72, dpi / 72)
  376. for index in range(start_page_id, end_page_id + 1):
  377. page = doc[index]
  378. # 渲染为 pixmap
  379. pm = page.get_pixmap(matrix=mat, alpha=False)
  380. # 如果超过尺寸限制,降低到 72 DPI
  381. if pm.width > 4500 or pm.height > 4500:
  382. logger.warning(
  383. f"⚠️ 页面 {index} 尺寸过大 ({pm.width}x{pm.height}), "
  384. f"降低到 72 DPI"
  385. )
  386. mat_fallback = fitz.Matrix(1, 1) # 72 DPI
  387. pm = page.get_pixmap(matrix=mat_fallback, alpha=False)
  388. # 转换为 PIL Image
  389. pil_img = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
  390. # 计算实际 scale
  391. page_rect = page.rect
  392. actual_scale = pm.width / page_rect.width
  393. # 构建返回字典
  394. image_dict = {
  395. 'img_pil': pil_img,
  396. 'scale': actual_scale
  397. }
  398. # 如果需要 BASE64
  399. if image_type.upper() == "BASE64":
  400. buffer = BytesIO()
  401. pil_img.save(buffer, format="JPEG")
  402. img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
  403. image_dict['img_base64'] = img_base64
  404. # 移除 img_pil 以节省内存
  405. del image_dict['img_pil']
  406. images_list.append(image_dict)
  407. logger.info(
  408. f"✅ PyMuPDF (fitz) 渲染完成: {len(images_list)} 页 "
  409. f"(DPI={dpi}, 单进程)"
  410. )
  411. return images_list, doc