pdf_text_extraction.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
  1. """
  2. PDF文本提取模块
  3. 提供从PDF文档中提取文本的功能,支持多种PDF引擎:
  4. - pypdfium2: MinerU标准引擎
  5. - fitz (PyMuPDF): 轻量级替代引擎
  6. 主要功能:
  7. - 区域文本提取:从指定bbox区域提取文本
  8. - 全页文本提取:提取页面所有文本块及其坐标
  9. - 自动rotation处理:自动应用PDF页面旋转变换
  10. - 返回图片rotation(逆时针定义):对外统一使用图片处理标准
  11. """
  12. from typing import Dict, List, Any, Tuple
  13. from loguru import logger
  14. # 导入坐标转换函数
  15. from .pdf_coordinate_transform import (
  16. transform_bbox_for_rotation_fitz,
  17. transform_bbox_for_rotation_pypdfium2,
  18. pdf_rotation_to_image_rotation
  19. )
  20. # 导入 MinerU 组件
  21. try:
  22. from mineru.utils.pdf_text_tool import get_page as pdf_get_page_text
  23. MINERU_AVAILABLE = True
  24. except ImportError:
  25. pdf_get_page_text = None
  26. MINERU_AVAILABLE = False
  27. def detect_pdf_doc_type(pdf_doc: Any) -> str:
  28. """
  29. 检测 PDF 文档对象类型
  30. Args:
  31. pdf_doc: PDF 文档对象
  32. Returns:
  33. 'pypdfium2' 或 'fitz'
  34. """
  35. doc_type_name = type(pdf_doc).__name__
  36. doc_module = type(pdf_doc).__module__
  37. if 'pdfium' in doc_module.lower() or 'PdfDocument' in doc_type_name:
  38. return 'pypdfium2'
  39. elif 'fitz' in doc_module.lower() or 'Document' in doc_type_name:
  40. return 'fitz'
  41. else:
  42. # 尝试通过属性判断
  43. if hasattr(pdf_doc, 'get_page') or hasattr(pdf_doc, 'page_count'):
  44. return 'fitz'
  45. else:
  46. return 'pypdfium2'
  47. def bbox_overlap(bbox1: List[float], bbox2: List[float]) -> bool:
  48. """
  49. 检查两个 bbox 是否重叠
  50. Args:
  51. bbox1: 第一个bbox [x1, y1, x2, y2]
  52. bbox2: 第二个bbox [x1, y1, x2, y2]
  53. Returns:
  54. True 如果重叠,否则 False
  55. """
  56. if len(bbox1) < 4 or len(bbox2) < 4:
  57. return False
  58. x1_1, y1_1, x2_1, y2_1 = bbox1[:4]
  59. x1_2, y1_2, x2_2, y2_2 = bbox2[:4]
  60. # 检查是否不重叠(取反)
  61. if x2_1 < x1_2 or x2_2 < x1_1:
  62. return False
  63. if y2_1 < y1_2 or y2_2 < y1_1:
  64. return False
  65. return True
  66. # ============================================================================
  67. # 区域文本提取
  68. # ============================================================================
  69. def extract_text_from_pdf(
  70. pdf_doc: Any,
  71. page_idx: int,
  72. bbox: List[float],
  73. scale: float
  74. ) -> Tuple[str, bool]:
  75. """
  76. 从PDF指定区域提取文本(支持 pypdfium2 和 fitz)
  77. Args:
  78. pdf_doc: PDF文档对象 (pypdfium2.PdfDocument 或 fitz.Document)
  79. page_idx: 页码索引(0-based)
  80. bbox: 目标区域的bbox(图像坐标)[x1, y1, x2, y2]
  81. scale: 图像与PDF的缩放比例
  82. Returns:
  83. (text, success)
  84. - text: 提取的文本
  85. - success: 是否成功提取到文本
  86. """
  87. doc_type = detect_pdf_doc_type(pdf_doc)
  88. if doc_type == 'fitz':
  89. return extract_text_from_pdf_fitz(pdf_doc, page_idx, bbox, scale)
  90. else: # pypdfium2
  91. return extract_text_from_pdf_pypdfium2(pdf_doc, page_idx, bbox, scale)
  92. def extract_text_from_pdf_pypdfium2(
  93. pdf_doc: Any,
  94. page_idx: int,
  95. bbox: List[float],
  96. scale: float
  97. ) -> Tuple[str, bool]:
  98. """
  99. 使用 pypdfium2 从指定区域提取文本
  100. Args:
  101. pdf_doc: pypdfium2.PdfDocument 对象
  102. page_idx: 页码索引
  103. bbox: 目标区域的bbox(图像坐标)
  104. scale: 缩放比例
  105. Returns:
  106. (text, success)
  107. """
  108. if not MINERU_AVAILABLE or pdf_get_page_text is None:
  109. logger.error("MinerU pdf_text_tool not available")
  110. return "", False
  111. try:
  112. page = pdf_doc[page_idx]
  113. # 将图像坐标转换为PDF坐标
  114. pdf_bbox = [
  115. bbox[0] / scale,
  116. bbox[1] / scale,
  117. bbox[2] / scale,
  118. bbox[3] / scale
  119. ]
  120. # 使用 MinerU 的方式获取页面文本信息
  121. page_dict = pdf_get_page_text(page)
  122. # 从 blocks 中提取与 bbox 重叠的文本
  123. text_parts = []
  124. for block in page_dict.get('blocks', []):
  125. for line in block.get('lines', []):
  126. line_bbox = line.get('bbox')
  127. if line_bbox and hasattr(line_bbox, 'bbox'):
  128. line_bbox = line_bbox.bbox
  129. elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
  130. line_bbox = list(line_bbox)
  131. else:
  132. continue
  133. if bbox_overlap(pdf_bbox, line_bbox):
  134. for span in line.get('spans', []):
  135. span_text = span.get('text', '')
  136. if span_text:
  137. text_parts.append(span_text)
  138. text = ' '.join(text_parts)
  139. return text.strip(), bool(text.strip())
  140. except Exception as e:
  141. import traceback
  142. logger.debug(f"pypdfium2 text extraction error: {e}")
  143. logger.debug(traceback.format_exc())
  144. return "", False
  145. def extract_text_from_pdf_fitz(
  146. pdf_doc: Any,
  147. page_idx: int,
  148. bbox: List[float],
  149. scale: float
  150. ) -> Tuple[str, bool]:
  151. """
  152. 使用 fitz 从指定区域提取文本
  153. Args:
  154. pdf_doc: fitz.Document 对象
  155. page_idx: 页码索引
  156. bbox: 目标区域的bbox(图像坐标)
  157. scale: 缩放比例
  158. Returns:
  159. (text, success)
  160. """
  161. try:
  162. import fitz
  163. except ImportError:
  164. logger.error("PyMuPDF (fitz) not available")
  165. return "", False
  166. try:
  167. page = pdf_doc[page_idx]
  168. # 将图像坐标转换为PDF坐标
  169. pdf_bbox = fitz.Rect(
  170. bbox[0] / scale,
  171. bbox[1] / scale,
  172. bbox[2] / scale,
  173. bbox[3] / scale
  174. )
  175. # 提取区域内的文本
  176. text = page.get_text("text", clip=pdf_bbox)
  177. return text.strip(), bool(text.strip())
  178. except Exception as e:
  179. import traceback
  180. logger.debug(f"fitz text extraction error: {e}")
  181. logger.debug(traceback.format_exc())
  182. return "", False
  183. # ============================================================================
  184. # 全页文本提取
  185. # ============================================================================
  186. def extract_all_text_blocks(
  187. pdf_doc: Any,
  188. page_idx: int,
  189. scale: float
  190. ) -> Tuple[List[Dict[str, Any]], int]:
  191. """
  192. 提取页面所有文本块(支持 pypdfium2 和 fitz)+ PDF rotation处理
  193. Args:
  194. pdf_doc: PDF文档对象
  195. page_idx: 页码索引(0-based)
  196. scale: 缩放比例
  197. Returns:
  198. (text_blocks, rotation_angle)
  199. - text_blocks: 文本块列表 [{'text': str, 'bbox': [x1, y1, x2, y2], 'origin_bbox': [...]}, ...]
  200. bbox坐标已转换为渲染图像坐标系(与OCR坐标系一致)
  201. - rotation_angle: 图片旋转角度(0/90/180/270),逆时针定义
  202. """
  203. doc_type = detect_pdf_doc_type(pdf_doc)
  204. if doc_type == 'fitz':
  205. return extract_all_text_blocks_fitz(pdf_doc, page_idx, scale)
  206. else:
  207. return extract_all_text_blocks_pypdfium2(pdf_doc, page_idx, scale)
  208. def extract_all_text_blocks_pypdfium2(
  209. pdf_doc: Any,
  210. page_idx: int,
  211. scale: float
  212. ) -> Tuple[List[Dict[str, Any]], int]:
  213. """
  214. 使用 pypdfium2 提取所有文本块并处理rotation
  215. Args:
  216. pdf_doc: pypdfium2.PdfDocument 对象
  217. page_idx: 页码索引
  218. scale: 缩放比例
  219. Returns:
  220. (text_blocks, rotation_angle)
  221. """
  222. if not MINERU_AVAILABLE or pdf_get_page_text is None:
  223. return [], 0
  224. try:
  225. page = pdf_doc[page_idx]
  226. page_dict = pdf_get_page_text(page)
  227. # 获取页面尺寸和rotation
  228. rotation = page_dict.get('rotation', 0)
  229. pdf_width = page_dict.get('width', 0)
  230. pdf_height = page_dict.get('height', 0)
  231. if rotation != 0:
  232. logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, size=({pdf_width}x{pdf_height})")
  233. extracted_blocks = []
  234. for block in page_dict.get('blocks', []):
  235. for line in block.get('lines', []):
  236. line_text = ""
  237. for span in line.get('spans', []):
  238. line_text += span.get('text', "")
  239. if not line_text.strip():
  240. continue
  241. line_bbox = line.get('bbox')
  242. if line_bbox and hasattr(line_bbox, 'bbox'):
  243. line_bbox = line_bbox.bbox
  244. elif isinstance(line_bbox, (list, tuple)) and len(line_bbox) >= 4:
  245. line_bbox = list(line_bbox)
  246. else:
  247. continue
  248. # 应用rotation坐标转换
  249. img_bbox = transform_bbox_for_rotation_pypdfium2(
  250. line_bbox, rotation, pdf_width, pdf_height, scale
  251. )
  252. extracted_blocks.append({
  253. 'text': line_text,
  254. 'bbox': img_bbox,
  255. 'origin_bbox': line_bbox
  256. })
  257. # 转换为图片rotation(逆时针定义)
  258. image_rotation = pdf_rotation_to_image_rotation(rotation)
  259. return extracted_blocks, image_rotation
  260. except Exception as e:
  261. logger.warning(f"pypdfium2 extract_all_text_blocks failed: {e}")
  262. import traceback
  263. logger.debug(traceback.format_exc())
  264. return [], 0
  265. def extract_all_text_blocks_fitz(
  266. pdf_doc: Any,
  267. page_idx: int,
  268. scale: float
  269. ) -> Tuple[List[Dict[str, Any]], int]:
  270. """
  271. 使用 fitz 提取所有文本块并处理rotation
  272. Args:
  273. pdf_doc: fitz.Document 对象
  274. page_idx: 页码索引
  275. scale: 缩放比例
  276. Returns:
  277. (text_blocks, rotation_angle)
  278. """
  279. try:
  280. import fitz
  281. except ImportError:
  282. logger.warning("PyMuPDF (fitz) not available")
  283. return [], 0
  284. try:
  285. page = pdf_doc[page_idx]
  286. # 获取页面rotation
  287. rotation = page.rotation # 0, 90, 180, 270
  288. # 获取页面尺寸(原始方向,未旋转)
  289. # page.rect 是旋转后的尺寸,我们需要原始尺寸
  290. if rotation in [90, 270]:
  291. # 宽高互换回来
  292. pdf_width = page.rect.height
  293. pdf_height = page.rect.width
  294. else:
  295. pdf_width = page.rect.width
  296. pdf_height = page.rect.height
  297. if rotation != 0:
  298. logger.info(f"📐 Page {page_idx}: PDF rotation={rotation}°, original_size=({pdf_width}x{pdf_height})")
  299. # 使用 get_text("dict") 获取详细的文本信息
  300. text_dict = page.get_text("dict")
  301. extracted_blocks = []
  302. # 遍历所有 blocks
  303. for block in text_dict.get("blocks", []):
  304. # 只处理文本块(type=0)
  305. if block.get("type") != 0:
  306. continue
  307. # 遍历所有 lines
  308. for line in block.get("lines", []):
  309. line_text = ""
  310. line_bbox = line.get("bbox")
  311. # 提取 line 中的所有 span 文本
  312. for span in line.get("spans", []):
  313. line_text += span.get("text", "")
  314. if not line_text.strip() or not line_bbox:
  315. continue
  316. # 应用rotation坐标转换
  317. img_bbox = transform_bbox_for_rotation_fitz(
  318. list(line_bbox), rotation, pdf_width, pdf_height, scale
  319. )
  320. extracted_blocks.append({
  321. 'text': line_text,
  322. 'bbox': img_bbox,
  323. 'origin_bbox': list(line_bbox)
  324. })
  325. # 转换为图片rotation(逆时针定义)
  326. image_rotation = pdf_rotation_to_image_rotation(rotation)
  327. return extracted_blocks, image_rotation
  328. except Exception as e:
  329. logger.warning(f"fitz extract_all_text_blocks failed: {e}")
  330. import traceback
  331. logger.debug(traceback.format_exc())
  332. return [], 0
  333. def detect_page_type(
  334. pdf_doc: Any,
  335. page_idx: int,
  336. char_threshold: int = 50
  337. ) -> str:
  338. """
  339. 检测PDF指定页是文字页还是图片页
  340. 基于字符密度的简单可靠方法
  341. """
  342. try:
  343. text_blocks, _ = extract_all_text_blocks(pdf_doc, page_idx, scale=1.0)
  344. total_chars = sum(len(block.get('text', '')) for block in text_blocks)
  345. return 'txt' if total_chars >= char_threshold else 'ocr'
  346. except:
  347. return 'ocr'