"""
PDF processing utilities (refactored).

Unified entry point for PDF-related processing:
- PDF loading and classification
- PDF text extraction (pypdfium2 and fitz are supported)
- PDF image rendering (multiple engines are supported)
- Coordinate conversion (PDF coordinates <-> image coordinates)
- Cross-page table merging
- Page-range parsing and filtering

This module has been split into several sub-modules:
- pdf_coordinate_transform: coordinate conversion
- pdf_text_extraction: text extraction
- pdf_image_rendering: image rendering
- pdf_utils: high-level API and unified entry point (this file)

For backward compatibility, all original functions are re-exported from the
new modules.
"""
  17. from typing import Dict, List, Any, Optional, Tuple, Set
  18. from pathlib import Path
  19. from PIL import Image
  20. from loguru import logger
  21. # 导入页面范围解析函数(不依赖 MinerU)
  22. from .file_utils import parse_page_range
  23. # 从子模块导入功能
  24. from .pdf_coordinate_transform import (
  25. transform_bbox_for_rotation_fitz,
  26. transform_bbox_for_rotation_pypdfium2,
  27. pdf_rotation_to_image_rotation,
  28. )
  29. from .pdf_text_extraction import (
  30. detect_pdf_doc_type,
  31. bbox_overlap,
  32. extract_text_from_pdf,
  33. extract_text_from_pdf_pypdfium2,
  34. extract_text_from_pdf_fitz,
  35. extract_all_text_blocks,
  36. extract_all_text_blocks_pypdfium2,
  37. extract_all_text_blocks_fitz,
  38. detect_page_type,
  39. get_page_rotation,
  40. )
  41. from .pdf_image_rendering import (
  42. load_images_from_pdf_unified,
  43. load_images_pypdfium2,
  44. load_images_fitz,
  45. )
  46. # 导入 MinerU 组件
  47. try:
  48. from .pdf_classify import classify as pdf_classify
  49. from mineru.utils.enum_class import ImageType
  50. MINERU_AVAILABLE = True
  51. except ImportError:
  52. raise ImportError("MinerU components not available for PDF processing")
  53. class PDFUtils:
  54. """
  55. PDF处理工具类(重构版)
  56. 本类提供PDF处理的高级API,内部调用已重构的子模块功能。
  57. 保持原有接口不变,确保向后兼容性。
  58. 子模块:
  59. - pdf_coordinate_transform: 坐标转换
  60. - pdf_text_extraction: 文本提取
  61. - pdf_image_rendering: 图像渲染
  62. """
  63. @staticmethod
  64. def parse_page_range(page_range: Optional[str], total_pages: int) -> Set[int]:
  65. """
  66. 解析页面范围字符串(向后兼容包装函数)
  67. 此方法是对 file_utils.parse_page_range 的包装,保持向后兼容性。
  68. 新代码应直接使用 file_utils.parse_page_range。
  69. 支持格式:
  70. - "1-5" → {0, 1, 2, 3, 4}(页码从1开始,内部转为0-based索引)
  71. - "3" → {2}
  72. - "1-5,7,9-12" → {0, 1, 2, 3, 4, 6, 8, 9, 10, 11}
  73. - "1-" → 从第1页到最后
  74. - "-5" → 从第1页到第5页
  75. Args:
  76. page_range: 页面范围字符串(页码从1开始)
  77. total_pages: 总页数
  78. Returns:
  79. 页面索引集合(0-based)
  80. """
  81. return parse_page_range(page_range, total_pages)
  82. @staticmethod
  83. def _detect_pdf_doc_type(pdf_doc: Any) -> str:
  84. """
  85. 检测 PDF 文档对象类型(向后兼容包装)
  86. Args:
  87. pdf_doc: PDF 文档对象
  88. Returns:
  89. 'pypdfium2' 或 'fitz'
  90. """
  91. return detect_pdf_doc_type(pdf_doc)
  92. @staticmethod
  93. def load_and_classify_document(
  94. document_path: Path,
  95. dpi: int = 200,
  96. page_range: Optional[str] = None,
  97. renderer: str = "fitz",
  98. pdf_bytes: Optional[bytes] = None,
  99. ) -> Tuple[List[Dict], str, Optional[Any], str]:
  100. """
  101. 加载文档并分类,支持页面范围过滤
  102. Args:
  103. document_path: 文档路径
  104. dpi: PDF渲染DPI
  105. page_range: 页面范围字符串,如 "1-5,7,9-12"
  106. - PDF:按页码(从1开始)
  107. - 图片目录:按文件名排序后的位置(从1开始)
  108. renderer: PDF渲染引擎,"fitz" 或 "pypdfium2"
  109. pdf_bytes: 可选的 PDF 字节数据;若提供则跳过从文件读取(用于内存中预处理后的 PDF)
  110. Returns:
  111. (images_list, pdf_type, pdf_doc, renderer_used)
  112. - images_list: 图像列表,每个元素包含 {'img_pil': PIL.Image, 'scale': float, 'page_idx': int}
  113. - pdf_type: 'ocr' 或 'txt'
  114. - pdf_doc: PDF文档对象(如果是PDF)
  115. - renderer_used: 实际使用的渲染器类型
  116. """
  117. pdf_doc = None
  118. pdf_type = 'ocr' # 默认使用OCR模式
  119. all_images = []
  120. if document_path.is_dir():
  121. # 处理目录:遍历所有图片
  122. image_extensions = {'.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif'}
  123. image_files = sorted([
  124. f for f in document_path.iterdir()
  125. if f.suffix.lower() in image_extensions
  126. ])
  127. # 解析页面范围
  128. total_pages = len(image_files)
  129. selected_pages = parse_page_range(page_range, total_pages)
  130. if page_range:
  131. logger.info(f"📋 图片目录共 {total_pages} 张,选择处理 {len(selected_pages)} 张")
  132. for idx, img_file in enumerate(image_files):
  133. if idx not in selected_pages:
  134. continue
  135. img = Image.open(img_file)
  136. if img.mode != 'RGB':
  137. img = img.convert('RGB')
  138. all_images.append({
  139. 'img_pil': img,
  140. 'scale': 1.0,
  141. 'source_path': str(img_file),
  142. 'page_idx': idx,
  143. 'page_name': img_file.stem
  144. })
  145. pdf_type = 'ocr'
  146. elif document_path.suffix.lower() == '.pdf':
  147. # 处理PDF文件
  148. if not MINERU_AVAILABLE:
  149. raise RuntimeError("MinerU components not available for PDF processing")
  150. if pdf_bytes is None:
  151. with open(document_path, 'rb') as f:
  152. pdf_bytes = f.read()
  153. # PDF分类
  154. pdf_type = pdf_classify(pdf_bytes)
  155. logger.info(f"📋 PDF classified as: {pdf_type}")
  156. # 加载图像(使用重构后的函数)
  157. images_list, pdf_doc = load_images_from_pdf_unified(
  158. pdf_bytes,
  159. dpi=dpi,
  160. image_type=ImageType.PIL,
  161. renderer=renderer
  162. )
  163. # 解析页面范围
  164. total_pages = len(images_list)
  165. selected_pages = parse_page_range(page_range, total_pages)
  166. if page_range:
  167. logger.info(f"📋 PDF 共 {total_pages} 页,选择处理 {len(selected_pages)} 页")
  168. for idx, img_dict in enumerate(images_list):
  169. if idx not in selected_pages:
  170. continue
  171. all_images.append({
  172. 'img_pil': img_dict['img_pil'],
  173. 'scale': img_dict.get('scale', dpi / 72),
  174. 'source_path': str(document_path),
  175. 'page_idx': idx,
  176. 'page_name': f"{document_path.stem}_page_{idx + 1:03d}"
  177. })
  178. elif document_path.suffix.lower() in ['.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif']:
  179. # 处理单个图片
  180. img = Image.open(document_path)
  181. if img.mode != 'RGB':
  182. img = img.convert('RGB')
  183. all_images.append({
  184. 'img_pil': img,
  185. 'scale': 1.0,
  186. 'source_path': str(document_path),
  187. 'page_idx': 0,
  188. 'page_name': document_path.stem
  189. })
  190. pdf_type = 'ocr'
  191. else:
  192. raise ValueError(f"Unsupported file format: {document_path.suffix}")
  193. return all_images, pdf_type, pdf_doc, renderer
  194. @staticmethod
  195. def _transform_bbox_for_rotation_fitz(
  196. bbox: List[float],
  197. rotation: int,
  198. pdf_width: float,
  199. pdf_height: float,
  200. scale: float
  201. ) -> List[float]:
  202. """向后兼容包装:fitz引擎坐标转换"""
  203. return transform_bbox_for_rotation_fitz(bbox, rotation, pdf_width, pdf_height, scale)
  204. @staticmethod
  205. def _transform_bbox_for_rotation_pypdfium2(
  206. bbox: List[float],
  207. rotation: int,
  208. pdf_width: float,
  209. pdf_height: float,
  210. scale: float
  211. ) -> List[float]:
  212. """向后兼容包装:pypdfium2引擎坐标转换"""
  213. return transform_bbox_for_rotation_pypdfium2(bbox, rotation, pdf_width, pdf_height, scale)
  214. # ========================================================================
  215. # 文本提取函数(向后兼容包装)
  216. # ========================================================================
  217. # ========================================================================
  218. # 文本提取函数(向后兼容包装)
  219. # ========================================================================
  220. @staticmethod
  221. def extract_text_from_pdf(
  222. pdf_doc: Any,
  223. page_idx: int,
  224. bbox: List[float],
  225. scale: float
  226. ) -> Tuple[str, bool]:
  227. """向后兼容包装:从PDF指定区域提取文本"""
  228. return extract_text_from_pdf(pdf_doc, page_idx, bbox, scale)
  229. @staticmethod
  230. def _extract_text_from_pdf_pypdfium2(
  231. pdf_doc: Any,
  232. page_idx: int,
  233. bbox: List[float],
  234. scale: float
  235. ) -> Tuple[str, bool]:
  236. """向后兼容包装:使用pypdfium2提取文本"""
  237. return extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale)
  238. @staticmethod
  239. def _extract_text_from_pdf_fitz(
  240. pdf_doc: Any,
  241. page_idx: int,
  242. bbox: List[float],
  243. scale: float
  244. ) -> Tuple[str, bool]:
  245. """向后兼容包装:使用fitz提取文本"""
  246. return extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale)
  247. @staticmethod
  248. def extract_all_text_blocks(
  249. pdf_doc: Any,
  250. page_idx: int,
  251. scale: float,
  252. return_upright_coords: bool = True,
  253. ) -> Tuple[List[Dict[str, Any]], int]:
  254. """向后兼容包装:提取页面所有文本块
  255. Args:
  256. return_upright_coords: 是否返回正视坐标(True=正视,False=旋转后)
  257. """
  258. return extract_all_text_blocks(pdf_doc, page_idx, scale, return_upright_coords)
  259. @staticmethod
  260. def _extract_all_text_blocks_pypdfium2(
  261. pdf_doc: Any,
  262. page_idx: int,
  263. scale: float
  264. ) -> Tuple[List[Dict[str, Any]], int]:
  265. """向后兼容包装:使用pypdfium2提取所有文本块"""
  266. return extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale)
  267. @staticmethod
  268. def _extract_all_text_blocks_fitz(
  269. pdf_doc: Any,
  270. page_idx: int,
  271. scale: float
  272. ) -> Tuple[List[Dict[str, Any]], int]:
  273. """向后兼容包装:使用fitz提取所有文本块"""
  274. return extract_all_text_blocks_fitz(pdf_doc, page_idx, scale)
  275. @staticmethod
  276. def _bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
  277. """向后兼容包装:检查两个bbox是否重叠"""
  278. return bbox_overlap(bbox1, bbox2)
  279. # ========================================================================
  280. # 图像渲染函数(向后兼容包装)
  281. # ========================================================================
  282. @staticmethod
  283. def load_images_from_pdf_unified(
  284. pdf_bytes: bytes,
  285. dpi: int = 200,
  286. start_page_id: int = 0,
  287. end_page_id: Optional[int] = None,
  288. image_type: str = "PIL",
  289. renderer: str = "pypdfium2",
  290. timeout: Optional[int] = None,
  291. threads: int = 4,
  292. ) -> Tuple[List[Dict[str, Any]], Any]:
  293. """向后兼容包装:统一的PDF图像加载接口"""
  294. return load_images_from_pdf_unified(
  295. pdf_bytes, dpi, start_page_id, end_page_id,
  296. image_type, renderer, timeout, threads
  297. )
  298. @staticmethod
  299. def _load_images_pypdfium2(
  300. pdf_bytes: bytes,
  301. dpi: int,
  302. start_page_id: int,
  303. end_page_id: Optional[int],
  304. image_type: str,
  305. timeout: Optional[int],
  306. threads: int
  307. ) -> Tuple[List[Dict[str, Any]], Any]:
  308. """向后兼容包装:使用pypdfium2渲染"""
  309. return load_images_pypdfium2(
  310. pdf_bytes, dpi, start_page_id, end_page_id,
  311. image_type, timeout, threads
  312. )
  313. @staticmethod
  314. def _load_images_fitz(
  315. pdf_bytes: bytes,
  316. dpi: int,
  317. start_page_id: int,
  318. end_page_id: Optional[int],
  319. image_type: str
  320. ) -> Tuple[List[Dict[str, Any]], Any]:
  321. """向后兼容包装:使用fitz渲染"""
  322. return load_images_fitz(
  323. pdf_bytes, dpi, start_page_id, end_page_id, image_type
  324. )
  325. @staticmethod
  326. def detect_page_type(
  327. pdf_doc: Any,
  328. page_idx: int,
  329. char_threshold: int = 50
  330. ) -> str:
  331. """
  332. 检测页面类型(文本PDF或扫描OCR)
  333. Returns:
  334. 页面类型:'txt' 或 'ocr'
  335. """
  336. return detect_page_type(pdf_doc, page_idx, char_threshold)
  337. @staticmethod
  338. def get_page_rotation(pdf_doc: Any, page_idx: int) -> int:
  339. """
  340. 获取PDF页面的旋转角度(逆时针定义,用于图像旋转)
  341. 返回的角度可直接用于PIL.rotate()等图像旋转函数。
  342. Args:
  343. pdf_doc: PDF文档对象 (pypdfium2.PdfDocument 或 fitz.Document)
  344. page_idx: 页码索引(0-based)
  345. Returns:
  346. 旋转角度:0/90/180/270(逆时针旋转角度)
  347. Examples:
  348. >>> rotate_angle = PDFUtils.get_page_rotation(pdf_doc, 0)
  349. >>> if rotate_angle != 0:
  350. >>> image = image.rotate(-rotate_angle, expand=True) # 旋转为正视
  351. """
  352. return get_page_rotation(pdf_doc, page_idx)
  353. # ========================================================================
  354. # 其他功能
  355. # ========================================================================
  356. @staticmethod
  357. def merge_cross_page_tables(results: Dict[str, Any]) -> Dict[str, Any]:
  358. """
  359. 合并跨页表格
  360. TODO: 实现跨页表格合并逻辑
  361. 可以参考 MinerU 的 cross_page_table_merge 实现
  362. Args:
  363. results: 处理结果字典
  364. Returns:
  365. 合并后的结果
  366. """
  367. # TODO: 实现跨页表格合并逻辑
  368. return results