pdf_image_tools.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. from io import BytesIO
  3. import numpy as np
  4. import pypdfium2 as pdfium
  5. from loguru import logger
  6. from PIL import Image
  7. from mineru.data.data_reader_writer import FileBasedDataWriter
  8. from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
  9. from .enum_class import ImageType
  10. from .hash_utils import str_sha256
  11. def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -> dict:
  12. """Convert pdfium.PdfDocument to image, Then convert the image to base64.
  13. Args:
  14. page (_type_): pdfium.PdfPage
  15. dpi (int, optional): reset the dpi of dpi. Defaults to 200.
  16. image_type (ImageType, optional): The type of image to return. Defaults to ImageType.PIL.
  17. Returns:
  18. dict: {'img_base64': str, 'img_pil': pil_img, 'scale': float }
  19. """
  20. pil_img, scale = page_to_image(page, dpi=dpi)
  21. image_dict = {
  22. "scale": scale,
  23. }
  24. if image_type == ImageType.BASE64:
  25. image_dict["img_base64"] = image_to_b64str(pil_img)
  26. else:
  27. image_dict["img_pil"] = pil_img
  28. return image_dict
  29. def load_images_from_pdf(
  30. pdf_bytes: bytes,
  31. dpi=200,
  32. start_page_id=0,
  33. end_page_id=None,
  34. image_type=ImageType.PIL, # PIL or BASE64
  35. ):
  36. images_list = []
  37. pdf_doc = pdfium.PdfDocument(pdf_bytes)
  38. pdf_page_num = len(pdf_doc)
  39. end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
  40. if end_page_id > pdf_page_num - 1:
  41. logger.warning("end_page_id is out of range, use images length")
  42. end_page_id = pdf_page_num - 1
  43. for index in range(0, pdf_page_num):
  44. if start_page_id <= index <= end_page_id:
  45. page = pdf_doc[index]
  46. image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
  47. images_list.append(image_dict)
  48. return images_list, pdf_doc
  49. def cut_image(bbox: tuple, page_num: int, page_pil_img, return_path, image_writer: FileBasedDataWriter, scale=2):
  50. """从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径 save_path:需要同时支持s3和本地,
  51. 图片存放在save_path下,文件名是:
  52. {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。"""
  53. # 拼接文件名
  54. filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}"
  55. # 老版本返回不带bucket的路径
  56. img_path = f"{return_path}_{filename}" if return_path is not None else None
  57. # 新版本生成平铺路径
  58. img_hash256_path = f"{str_sha256(img_path)}.jpg"
  59. # img_hash256_path = f'{img_path}.jpg'
  60. crop_img = get_crop_img(bbox, page_pil_img, scale=scale)
  61. img_bytes = image_to_bytes(crop_img, image_format="JPEG")
  62. image_writer.write(img_hash256_path, img_bytes)
  63. return img_hash256_path
  64. def get_crop_img(bbox: tuple, pil_img, scale=2):
  65. scale_bbox = (
  66. int(bbox[0] * scale),
  67. int(bbox[1] * scale),
  68. int(bbox[2] * scale),
  69. int(bbox[3] * scale),
  70. )
  71. return pil_img.crop(scale_bbox)
  72. def get_crop_np_img(bbox: tuple, input_img, scale=2):
  73. if isinstance(input_img, Image.Image):
  74. np_img = np.asarray(input_img)
  75. elif isinstance(input_img, np.ndarray):
  76. np_img = input_img
  77. else:
  78. raise ValueError("Input must be a pillow object or a numpy array.")
  79. scale_bbox = (
  80. int(bbox[0] * scale),
  81. int(bbox[1] * scale),
  82. int(bbox[2] * scale),
  83. int(bbox[3] * scale),
  84. )
  85. return np_img[scale_bbox[1]:scale_bbox[3], scale_bbox[0]:scale_bbox[2]]
  86. def images_bytes_to_pdf_bytes(image_bytes):
  87. # 内存缓冲区
  88. pdf_buffer = BytesIO()
  89. # 载入并转换所有图像为 RGB 模式
  90. image = Image.open(BytesIO(image_bytes)).convert("RGB")
  91. # 第一张图保存为 PDF,其余追加
  92. image.save(pdf_buffer, format="PDF", save_all=True)
  93. # 获取 PDF bytes 并重置指针(可选)
  94. pdf_bytes = pdf_buffer.getvalue()
  95. pdf_buffer.close()
  96. return pdf_bytes