pdf_text_extraction.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484
  1. """
  2. PDF文本提取模块
  3. 提供从PDF文档中提取文本的功能,支持多种PDF引擎:
  4. - pypdfium2: MinerU标准引擎
  5. - fitz (PyMuPDF): 轻量级替代引擎
  6. 主要功能:
  7. - 区域文本提取:从指定bbox区域提取文本
  8. - 全页文本提取:提取页面所有文本块及其坐标
  9. - 自动rotation处理:自动应用PDF页面旋转变换
  10. - 返回图片rotation(逆时针定义):对外统一使用图片处理标准
  11. """
  12. from typing import Dict, List, Any, Tuple, Optional
  13. from loguru import logger
  14. # 导入坐标转换函数
  15. from .pdf_coordinate_transform import (
  16. transform_bbox_for_rotation_fitz,
  17. transform_bbox_for_rotation_pypdfium2,
  18. pdf_rotation_to_image_rotation
  19. )
  20. # 导入 MinerU 组件
  21. try:
  22. from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text
  23. MINERU_AVAILABLE = True
  24. except ImportError:
  25. pdf_get_page_text = None
  26. MINERU_AVAILABLE = False
  27. def detect_pdf_doc_type(pdf_doc: Any) -> str:
  28. """
  29. 检测 PDF 文档对象类型
  30. Args:
  31. pdf_doc: PDF 文档对象
  32. Returns:
  33. 'pypdfium2' 或 'fitz'
  34. """
  35. doc_type_name = type(pdf_doc).__name__
  36. doc_module = type(pdf_doc).__module__
  37. if 'pdfium' in doc_module.lower() or 'PdfDocument' in doc_type_name:
  38. return 'pypdfium2'
  39. elif 'fitz' in doc_module.lower() or 'Document' in doc_type_name:
  40. return 'fitz'
  41. else:
  42. # 尝试通过属性判断
  43. if hasattr(pdf_doc, 'get_page') or hasattr(pdf_doc, 'page_count'):
  44. return 'fitz'
  45. else:
  46. return 'pypdfium2'
  47. def bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
  48. """
  49. 检查两个 bbox 是否重叠
  50. Args:
  51. bbox1: 第一个bbox [x1, y1, x2, y2]
  52. bbox2: 第二个bbox [x1, y1, x2, y2]
  53. Returns:
  54. True 如果重叠,否则 False
  55. """
  56. if len(bbox1) < 4 or len(bbox2) < 4:
  57. return False
  58. x1_1, y1_1, x2_1, y2_1 = bbox1[:4]
  59. x1_2, y1_2, x2_2, y2_2 = bbox2[:4]
  60. # 检查是否不重叠(取反)
  61. if x2_1 < x1_2 or x2_2 < x1_1:
  62. return False
  63. if y2_1 < y1_2 or y2_2 < y1_1:
  64. return False
  65. return True
  66. # ============================================================================
  67. # 区域文本提取
  68. # ============================================================================
  69. def extract_text_from_pdf(
  70. pdf_doc: Any,
  71. page_idx: int,
  72. bbox: List[float],
  73. scale: float
  74. ) -> Tuple[str, bool]:
  75. """
  76. 从PDF指定区域提取文本(支持 pypdfium2 和 fitz)
  77. Args:
  78. pdf_doc: PDF文档对象 (pypdfium2.PdfDocument 或 fitz.Document)
  79. page_idx: 页码索引(0-based)
  80. bbox: 目标区域的bbox(图像坐标)[x1, y1, x2, y2]
  81. scale: 图像与PDF的缩放比例
  82. Returns:
  83. (text, success)
  84. - text: 提取的文本
  85. - success: 是否成功提取到文本
  86. """
  87. doc_type = detect_pdf_doc_type(pdf_doc)
  88. if doc_type == 'fitz':
  89. return extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale)
  90. else: # pypdfium2
  91. return extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale)
  92. def extract_text_from_pdf_pypdfium2(
  93. pdf_doc: Any,
  94. page_idx: int,
  95. bbox: List[float],
  96. scale: float
  97. ) -> Tuple[str, bool]:
  98. """
  99. 使用 pypdfium2 从指定区域提取文本
  100. Args:
  101. pdf_doc: pypdfium2.PdfDocument 对象
  102. page_idx: 页码索引
  103. bbox: 目标区域的bbox(图像坐标)
  104. scale: 缩放比例
  105. Returns:
  106. (text, success)
  107. """
  108. if not MINERU_AVAILABLE or pdf_get_page_text is None:
  109. logger.error("MinerU pdf_text_tool not available")
  110. return "", False
  111. try:
  112. page = pdf_doc[page_idx]
  113. # 将图像坐标转换为PDF坐标
  114. pdf_bbox = [
  115. bbox[0] / scale,
  116. bbox[1] / scale,
  117. bbox[2] / scale,
  118. bbox[3] / scale
  119. ]
  120. # 使用 MinerU 的方式获取页面文本信息
  121. page_dict = pdf_get_page_text(page)
  122. # 从 blocks 中提取与 bbox 重叠的文本
  123. text_parts = []
  124. for block in page_dict.get('blocks', []):
  125. for line in block.get('lines', []):
  126. line_bbox = line.get('bbox')
  127. if line_bbox and hasattr(line_bbox, 'bbox'):
  128. line_bbox = line_bbox.bbox
  129. elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
  130. line_bbox = list(line_bbox)
  131. else:
  132. continue
  133. if bbox_overlap(pdf_bbox, line_bbox):
  134. for span in line.get('spans', []):
  135. span_text = span.get('text', '')
  136. if span_text:
  137. text_parts.append(span_text)
  138. text = ' '.join(text_parts)
  139. return text.strip(), bool(text.strip())
  140. except Exception as e:
  141. import traceback
  142. logger.debug(f"pypdfium2 text extraction error: {e}")
  143. logger.debug(traceback.format_exc())
  144. return "", False
  145. def extract_text_from_pdf_fitz(
  146. pdf_doc: Any,
  147. page_idx: int,
  148. bbox: List[float],
  149. scale: float
  150. ) -> Tuple[str, bool]:
  151. """
  152. 使用 fitz 从指定区域提取文本
  153. Args:
  154. pdf_doc: fitz.Document 对象
  155. page_idx: 页码索引
  156. bbox: 目标区域的bbox(图像坐标)
  157. scale: 缩放比例
  158. Returns:
  159. (text, success)
  160. """
  161. try:
  162. import fitz
  163. except ImportError:
  164. logger.error("PyMuPDF (fitz) not available")
  165. return "", False
  166. try:
  167. page = pdf_doc[page_idx]
  168. # 将图像坐标转换为PDF坐标
  169. pdf_bbox = fitz.Rect(
  170. bbox[0] / scale,
  171. bbox[1] / scale,
  172. bbox[2] / scale,
  173. bbox[3] / scale
  174. )
  175. # 提取区域内的文本
  176. text = page.get_text("text", clip=pdf_bbox)
  177. return text.strip(), bool(text.strip())
  178. except Exception as e:
  179. import traceback
  180. logger.debug(f"fitz text extraction error: {e}")
  181. logger.debug(traceback.format_exc())
  182. return "", False
  183. # ============================================================================
  184. # 全页文本提取
  185. # ============================================================================
  186. def extract_all_text_blocks(
  187. pdf_doc: Any,
  188. page_idx: int,
  189. scale: float,
  190. return_upright_coords: bool = True,
  191. ) -> Tuple[List[Dict[str, Any]], int]:
  192. """
  193. 提取页面所有文本块(支持 pypdfium2 和 fitz)
  194. **统一对外输出逻辑**:
  195. - return_upright_coords=True: 返回正视坐标(推荐,与OCR坐标系一致)
  196. - return_upright_coords=False: 返回旋转后坐标(与PDF rotation匹配)
  197. Args:
  198. pdf_doc: PDF文档对象
  199. page_idx: 页码索引(0-based)
  200. scale: 缩放比例
  201. return_upright_coords: 是否返回正视坐标
  202. True=正视坐标(默认,推荐)
  203. False=旋转后坐标
  204. Returns:
  205. (text_blocks, rotation_angle)
  206. - text_blocks: 文本块列表 [{'text': str, 'bbox': [x1, y1, x2, y2], 'origin_bbox': [...]}, ...]
  207. - rotation_angle: 图片旋转角度(0/90/180/270),逆时针定义
  208. """
  209. doc_type = detect_pdf_doc_type(pdf_doc)
  210. if doc_type == 'fitz':
  211. return extract_all_text_blocks_fitz(pdf_doc, page_idx, scale, return_upright_coords)
  212. else:
  213. return extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale, return_upright_coords)
  214. def extract_all_text_blocks_pypdfium2(
  215. pdf_doc: Any,
  216. page_idx: int,
  217. scale: float,
  218. return_upright_coords: bool = True
  219. ) -> Tuple[List[Dict[str, Any]], int]:
  220. """
  221. 使用 pypdfium2 提取所有文本块并处理rotation
  222. Args:
  223. pdf_doc: pypdfium2.PdfDocument 对象
  224. page_idx: 页码索引
  225. scale: 缩放比例
  226. return_upright_coords: 是否返回正视坐标(True=正视,False=旋转后)
  227. Returns:
  228. (text_blocks, rotation_angle)
  229. """
  230. if not MINERU_AVAILABLE or pdf_get_page_text is None:
  231. return [], 0
  232. try:
  233. page = pdf_doc[page_idx]
  234. page_dict = pdf_get_page_text(page)
  235. # 获取页面尺寸和rotation
  236. rotation = page_dict.get('rotation', 0)
  237. pdf_width = page_dict.get('width', 0)
  238. pdf_height = page_dict.get('height', 0)
  239. if rotation != 0:
  240. logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, size=({pdf_width}x{pdf_height}), return_upright={return_upright_coords}")
  241. extracted_blocks = []
  242. for block in page_dict.get('blocks', []):
  243. for line in block.get('lines', []):
  244. line_text = ""
  245. for span in line.get('spans', []):
  246. line_text += span.get('text', "")
  247. if not line_text.strip():
  248. continue
  249. line_bbox = line.get('bbox')
  250. if line_bbox and hasattr(line_bbox, 'bbox'):
  251. line_bbox = line_bbox.bbox
  252. elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
  253. line_bbox = list(line_bbox)
  254. else:
  255. continue
  256. # pypdfium2返回旋转后坐标,根据 return_upright_coords 决定是否转换为正视坐标
  257. img_bbox = transform_bbox_for_rotation_pypdfium2(
  258. line_bbox, rotation, pdf_width, pdf_height, scale, to_upright=return_upright_coords
  259. )
  260. extracted_blocks.append({
  261. 'text': line_text,
  262. 'bbox': img_bbox,
  263. 'origin_bbox': line_bbox
  264. })
  265. # 转换为图片rotation(逆时针定义)
  266. image_rotation = pdf_rotation_to_image_rotation(rotation)
  267. return extracted_blocks, image_rotation
  268. except Exception as e:
  269. logger.warning(f"pypdfium2 extract_all_text_blocks failed: {e}")
  270. import traceback
  271. logger.debug(traceback.format_exc())
  272. return [], 0
  273. def extract_all_text_blocks_fitz(
  274. pdf_doc: Any,
  275. page_idx: int,
  276. scale: float,
  277. return_upright_coords: bool = True
  278. ) -> Tuple[List[Dict[str, Any]], int]:
  279. """
  280. 使用 fitz 提取所有文本块并处理rotation
  281. Args:
  282. pdf_doc: fitz.Document 对象
  283. page_idx: 页码索引
  284. scale: 缩放比例
  285. return_upright_coords: 是否返回正视坐标(True=正视,False=旋转后)
  286. Returns:
  287. (text_blocks, rotation_angle)
  288. """
  289. try:
  290. import fitz
  291. except ImportError:
  292. logger.warning("PyMuPDF (fitz) not available")
  293. return [], 0
  294. try:
  295. page = pdf_doc[page_idx]
  296. # 获取页面rotation
  297. rotation = page.rotation # 0, 90, 180, 270
  298. # 获取页面尺寸(原始方向,未旋转)
  299. # page.rect 是旋转后的尺寸,我们需要原始尺寸
  300. if rotation in [90, 270]:
  301. # 宽高互换回来
  302. pdf_width = page.rect.height
  303. pdf_height = page.rect.width
  304. else:
  305. pdf_width = page.rect.width
  306. pdf_height = page.rect.height
  307. if rotation != 0:
  308. logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, original_size=({pdf_width}x{pdf_height}), return_upright={return_upright_coords}")
  309. # 使用 get_text("dict") 获取详细的文本信息
  310. text_dict = page.get_text("dict")
  311. extracted_blocks = []
  312. # 遍历所有 blocks
  313. for block in text_dict.get("blocks", []):
  314. # 只处理文本块(type=0)
  315. if block.get("type") != 0:
  316. continue
  317. # 遍历所有 lines
  318. for line in block.get("lines", []):
  319. line_text = ""
  320. line_bbox = line.get("bbox")
  321. # 提取 line 中的所有 span 文本
  322. for span in line.get("spans", []):
  323. line_text += span.get("text", "")
  324. if not line_text.strip() or not line_bbox:
  325. continue
  326. # fitz返回正视坐标,根据 return_upright_coords 决定是否转换为旋转后坐标
  327. to_rotated = not return_upright_coords # 反转逻辑
  328. img_bbox = transform_bbox_for_rotation_fitz(
  329. list(line_bbox), rotation, pdf_width, pdf_height, scale, to_rotated=to_rotated
  330. )
  331. extracted_blocks.append({
  332. 'text': line_text,
  333. 'bbox': img_bbox,
  334. 'origin_bbox': list(line_bbox)
  335. })
  336. # 转换为图片rotation(逆时针定义)
  337. image_rotation = pdf_rotation_to_image_rotation(rotation)
  338. return extracted_blocks, image_rotation
  339. except Exception as e:
  340. logger.warning(f"fitz extract_all_text_blocks failed: {e}")
  341. import traceback
  342. logger.debug(traceback.format_exc())
  343. return [], 0
  344. def get_page_rotation(pdf_doc: Any, page_idx: int) -> int:
  345. """
  346. 获取PDF页面的旋转角度(逆时针定义,用于图像旋转)
  347. 返回的角度可直接用于PIL.rotate()等图像旋转函数。
  348. Args:
  349. pdf_doc: PDF文档对象 (pypdfium2.PdfDocument 或 fitz.Document)
  350. page_idx: 页码索引(0-based)
  351. Returns:
  352. 旋转角度:0/90/180/270(逆时针旋转角度)
  353. Examples:
  354. >>> pdf_doc = fitz.open("test.pdf")
  355. >>> rotate_angle = get_page_rotation(pdf_doc, 0)
  356. >>> if rotate_angle != 0:
  357. >>> image = image.rotate(-rotate_angle, expand=True) # 旋转为正视
  358. """
  359. try:
  360. doc_type = detect_pdf_doc_type(pdf_doc)
  361. # 获取PDF的rotation属性(顺时针定义)
  362. if doc_type == "pypdfium2":
  363. pdf_rotation = pdf_doc[page_idx].get_rotation()
  364. else: # fitz
  365. pdf_rotation = pdf_doc[page_idx].rotation
  366. # 转换为图像rotation(逆时针定义)
  367. image_rotation = pdf_rotation_to_image_rotation(pdf_rotation)
  368. return image_rotation
  369. except Exception as e:
  370. logger.warning(f"Failed to get page rotation for page {page_idx}: {e}")
  371. return 0
  372. def detect_page_type(
  373. pdf_doc: Any,
  374. page_idx: int,
  375. char_threshold: int = 50
  376. ) -> str:
  377. """
  378. 检测PDF指定页是文字页还是图片页
  379. 基于字符密度的简单可靠方法
  380. """
  381. try:
  382. # 这里使用默认 with_rotation=True,因为只需要计数字符
  383. text_blocks, _ = extract_all_text_blocks(pdf_doc, page_idx, scale=1.0)
  384. total_chars = sum(len(block.get('text', '')) for block in text_blocks)
  385. return 'txt' if total_chars >= char_threshold else 'ocr'
  386. except:
  387. return 'ocr'