# pdf_reader.py
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. import base64
  3. from io import BytesIO
  4. from loguru import logger
  5. from PIL import Image
  6. from pypdfium2 import PdfBitmap, PdfDocument, PdfPage
  7. def page_to_image(
  8. page: PdfPage,
  9. dpi: int = 200,
  10. max_width_or_height: int = 3500, # changed from 4500 to 3500
  11. ) -> (Image.Image, float):
  12. scale = dpi / 72
  13. long_side_length = max(*page.get_size())
  14. if (long_side_length*scale) > max_width_or_height:
  15. scale = max_width_or_height / long_side_length
  16. bitmap: PdfBitmap = page.render(scale=scale) # type: ignore
  17. image = bitmap.to_pil()
  18. bitmap.close()
  19. return image, scale
  20. def image_to_bytes(
  21. image: Image.Image,
  22. # image_format: str = "PNG", # 也可以用 "JPEG"
  23. image_format: str = "JPEG",
  24. ) -> bytes:
  25. with BytesIO() as image_buffer:
  26. image.save(image_buffer, format=image_format)
  27. return image_buffer.getvalue()
  28. def image_to_b64str(
  29. image: Image.Image,
  30. # image_format: str = "PNG", # 也可以用 "JPEG"
  31. image_format: str = "JPEG",
  32. ) -> str:
  33. image_bytes = image_to_bytes(image, image_format)
  34. return base64.b64encode(image_bytes).decode("utf-8")
  35. def base64_to_pil_image(
  36. base64_str: str,
  37. ) -> Image.Image:
  38. """Convert base64 string to PIL Image."""
  39. image_bytes = base64.b64decode(base64_str)
  40. with BytesIO(image_bytes) as image_buffer:
  41. return Image.open(image_buffer).convert("RGB")
  42. def pdf_to_images(
  43. pdf: str | bytes | PdfDocument,
  44. dpi: int = 200,
  45. max_width_or_height: int = 3500,
  46. start_page_id: int = 0,
  47. end_page_id: int | None = None,
  48. ) -> list[Image.Image]:
  49. doc = pdf if isinstance(pdf, PdfDocument) else PdfDocument(pdf)
  50. page_num = len(doc)
  51. end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else page_num - 1
  52. if end_page_id > page_num - 1:
  53. logger.warning("end_page_id is out of range, use images length")
  54. end_page_id = page_num - 1
  55. images = []
  56. try:
  57. for i in range(start_page_id, end_page_id + 1):
  58. image, _ = page_to_image(doc[i], dpi, max_width_or_height)
  59. images.append(image)
  60. finally:
  61. try:
  62. doc.close()
  63. except Exception:
  64. pass
  65. return images
  66. def pdf_to_images_bytes(
  67. pdf: str | bytes | PdfDocument,
  68. dpi: int = 200,
  69. max_width_or_height: int = 3500,
  70. start_page_id: int = 0,
  71. end_page_id: int | None = None,
  72. # image_format: str = "PNG", # 也可以用 "JPEG"
  73. image_format: str = "JPEG",
  74. ) -> list[bytes]:
  75. images = pdf_to_images(pdf, dpi, max_width_or_height, start_page_id, end_page_id)
  76. return [image_to_bytes(image, image_format) for image in images]
  77. def pdf_to_images_b64strs(
  78. pdf: str | bytes | PdfDocument,
  79. dpi: int = 200,
  80. max_width_or_height: int = 3500,
  81. start_page_id: int = 0,
  82. end_page_id: int | None = None,
  83. # image_format: str = "PNG", # 也可以用 "JPEG"
  84. image_format: str = "JPEG",
  85. ) -> list[str]:
  86. images = pdf_to_images(pdf, dpi, max_width_or_height, start_page_id, end_page_id)
  87. return [image_to_b64str(image, image_format) for image in images]