pdf_reader.py 2.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. import base64
  3. from io import BytesIO
  4. from loguru import logger
  5. from PIL import Image
  6. from pypdfium2 import PdfBitmap, PdfDocument, PdfPage
  7. def page_to_image(
  8. page: PdfPage,
  9. dpi: int = 144, # changed from 200 to 144
  10. max_width_or_height: int = 2560, # changed from 4500 to 2560
  11. ) -> (Image.Image, float):
  12. scale = dpi / 72
  13. long_side_length = max(*page.get_size())
  14. if long_side_length > max_width_or_height:
  15. scale = max_width_or_height / long_side_length
  16. bitmap: PdfBitmap = page.render(scale=scale) # type: ignore
  17. try:
  18. image = bitmap.to_pil()
  19. finally:
  20. try:
  21. bitmap.close()
  22. except Exception:
  23. pass
  24. return image, scale
  25. def image_to_bytes(
  26. image: Image.Image,
  27. image_format: str = "PNG", # 也可以用 "JPEG"
  28. ) -> bytes:
  29. with BytesIO() as image_buffer:
  30. image.save(image_buffer, format=image_format)
  31. return image_buffer.getvalue()
  32. def image_to_b64str(
  33. image: Image.Image,
  34. image_format: str = "PNG", # 也可以用 "JPEG"
  35. ) -> str:
  36. image_bytes = image_to_bytes(image, image_format)
  37. return base64.b64encode(image_bytes).decode("utf-8")
  38. def pdf_to_images(
  39. pdf: str | bytes | PdfDocument,
  40. dpi: int = 144,
  41. max_width_or_height: int = 2560,
  42. start_page_id: int = 0,
  43. end_page_id: int | None = None,
  44. ) -> list[Image.Image]:
  45. doc = pdf if isinstance(pdf, PdfDocument) else PdfDocument(pdf)
  46. page_num = len(doc)
  47. end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else page_num - 1
  48. if end_page_id > page_num - 1:
  49. logger.warning("end_page_id is out of range, use images length")
  50. end_page_id = page_num - 1
  51. images = []
  52. try:
  53. for i in range(start_page_id, end_page_id + 1):
  54. image, _ = page_to_image(doc[i], dpi, max_width_or_height)
  55. images.append(image)
  56. finally:
  57. try:
  58. doc.close()
  59. except Exception:
  60. pass
  61. return images
  62. def pdf_to_images_bytes(
  63. pdf: str | bytes | PdfDocument,
  64. dpi: int = 144,
  65. max_width_or_height: int = 2560,
  66. start_page_id: int = 0,
  67. end_page_id: int | None = None,
  68. image_format: str = "PNG",
  69. ) -> list[bytes]:
  70. images = pdf_to_images(pdf, dpi, max_width_or_height, start_page_id, end_page_id)
  71. return [image_to_bytes(image, image_format) for image in images]
  72. def pdf_to_images_b64strs(
  73. pdf: str | bytes | PdfDocument,
  74. dpi: int = 144,
  75. max_width_or_height: int = 2560,
  76. start_page_id: int = 0,
  77. end_page_id: int | None = None,
  78. image_format: str = "PNG",
  79. ) -> list[str]:
  80. images = pdf_to_images(pdf, dpi, max_width_or_height, start_page_id, end_page_id)
  81. return [image_to_b64str(image, image_format) for image in images]