# pdf_reader.py (3.2 KB)
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. import base64
  3. from io import BytesIO
  4. from loguru import logger
  5. from PIL import Image
  6. from pypdfium2 import PdfBitmap, PdfDocument, PdfPage
  7. def page_to_image(
  8. page: PdfPage,
  9. dpi: int = 200,
  10. max_width_or_height: int = 3500, # changed from 4500 to 3500
  11. ) -> (Image.Image, float):
  12. scale = dpi / 72
  13. long_side_length = max(*page.get_size())
  14. if (long_side_length*scale) > max_width_or_height:
  15. scale = max_width_or_height / long_side_length
  16. bitmap: PdfBitmap = page.render(scale=scale) # type: ignore
  17. image = bitmap.to_pil()
  18. try:
  19. bitmap.close()
  20. except Exception as e:
  21. logger.error(f"Failed to close bitmap: {e}")
  22. return image, scale
  23. def image_to_bytes(
  24. image: Image.Image,
  25. # image_format: str = "PNG", # 也可以用 "JPEG"
  26. image_format: str = "JPEG",
  27. ) -> bytes:
  28. with BytesIO() as image_buffer:
  29. image.save(image_buffer, format=image_format)
  30. return image_buffer.getvalue()
  31. def image_to_b64str(
  32. image: Image.Image,
  33. # image_format: str = "PNG", # 也可以用 "JPEG"
  34. image_format: str = "JPEG",
  35. ) -> str:
  36. image_bytes = image_to_bytes(image, image_format)
  37. return base64.b64encode(image_bytes).decode("utf-8")
  38. def base64_to_pil_image(
  39. base64_str: str,
  40. ) -> Image.Image:
  41. """Convert base64 string to PIL Image."""
  42. image_bytes = base64.b64decode(base64_str)
  43. with BytesIO(image_bytes) as image_buffer:
  44. return Image.open(image_buffer).convert("RGB")
  45. def pdf_to_images(
  46. pdf: str | bytes | PdfDocument,
  47. dpi: int = 200,
  48. max_width_or_height: int = 3500,
  49. start_page_id: int = 0,
  50. end_page_id: int | None = None,
  51. ) -> list[Image.Image]:
  52. doc = pdf if isinstance(pdf, PdfDocument) else PdfDocument(pdf)
  53. page_num = len(doc)
  54. end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else page_num - 1
  55. if end_page_id > page_num - 1:
  56. logger.warning("end_page_id is out of range, use images length")
  57. end_page_id = page_num - 1
  58. images = []
  59. try:
  60. for i in range(start_page_id, end_page_id + 1):
  61. image, _ = page_to_image(doc[i], dpi, max_width_or_height)
  62. images.append(image)
  63. finally:
  64. try:
  65. doc.close()
  66. except Exception:
  67. pass
  68. return images
  69. def pdf_to_images_bytes(
  70. pdf: str | bytes | PdfDocument,
  71. dpi: int = 200,
  72. max_width_or_height: int = 3500,
  73. start_page_id: int = 0,
  74. end_page_id: int | None = None,
  75. # image_format: str = "PNG", # 也可以用 "JPEG"
  76. image_format: str = "JPEG",
  77. ) -> list[bytes]:
  78. images = pdf_to_images(pdf, dpi, max_width_or_height, start_page_id, end_page_id)
  79. return [image_to_bytes(image, image_format) for image in images]
  80. def pdf_to_images_b64strs(
  81. pdf: str | bytes | PdfDocument,
  82. dpi: int = 200,
  83. max_width_or_height: int = 3500,
  84. start_page_id: int = 0,
  85. end_page_id: int | None = None,
  86. # image_format: str = "PNG", # 也可以用 "JPEG"
  87. image_format: str = "JPEG",
  88. ) -> list[str]:
  89. images = pdf_to_images(pdf, dpi, max_width_or_height, start_page_id, end_page_id)
  90. return [image_to_b64str(image, image_format) for image in images]