pdf_image_tools.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. from io import BytesIO
  3. import pypdfium2 as pdfium
  4. from loguru import logger
  5. from PIL import Image
  6. from mineru.data.data_reader_writer import FileBasedDataWriter
  7. from mineru.utils.pdf_reader import image_to_b64str, image_to_bytes, page_to_image
  8. from .enum_class import ImageType
  9. from .hash_utils import str_sha256
  10. def pdf_page_to_image(page: pdfium.PdfPage, dpi=200, image_type=ImageType.PIL) -> dict:
  11. """Convert pdfium.PdfDocument to image, Then convert the image to base64.
  12. Args:
  13. page (_type_): pdfium.PdfPage
  14. dpi (int, optional): reset the dpi of dpi. Defaults to 200.
  15. image_type (ImageType, optional): The type of image to return. Defaults to ImageType.PIL.
  16. Returns:
  17. dict: {'img_base64': str, 'img_pil': pil_img, 'scale': float }
  18. """
  19. pil_img, scale = page_to_image(page, dpi=dpi)
  20. image_dict = {
  21. "scale": scale,
  22. }
  23. if image_type == ImageType.BASE64:
  24. image_dict["img_base64"] = image_to_b64str(pil_img)
  25. else:
  26. image_dict["img_pil"] = pil_img
  27. return image_dict
  28. def load_images_from_pdf(
  29. pdf_bytes: bytes,
  30. dpi=200,
  31. start_page_id=0,
  32. end_page_id=None,
  33. image_type=ImageType.PIL, # PIL or BASE64
  34. ):
  35. images_list = []
  36. pdf_doc = pdfium.PdfDocument(pdf_bytes)
  37. pdf_page_num = len(pdf_doc)
  38. end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
  39. if end_page_id > pdf_page_num - 1:
  40. logger.warning("end_page_id is out of range, use images length")
  41. end_page_id = pdf_page_num - 1
  42. for index in range(0, pdf_page_num):
  43. if start_page_id <= index <= end_page_id:
  44. page = pdf_doc[index]
  45. image_dict = pdf_page_to_image(page, dpi=dpi, image_type=image_type)
  46. images_list.append(image_dict)
  47. return images_list, pdf_doc
  48. def cut_image(bbox: tuple, page_num: int, page_pil_img, return_path, image_writer: FileBasedDataWriter, scale=2):
  49. """从第page_num页的page中,根据bbox进行裁剪出一张jpg图片,返回图片路径 save_path:需要同时支持s3和本地,
  50. 图片存放在save_path下,文件名是:
  51. {page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg , bbox内数字取整。"""
  52. # 拼接文件名
  53. filename = f"{page_num}_{int(bbox[0])}_{int(bbox[1])}_{int(bbox[2])}_{int(bbox[3])}"
  54. # 老版本返回不带bucket的路径
  55. img_path = f"{return_path}_{filename}" if return_path is not None else None
  56. # 新版本生成平铺路径
  57. img_hash256_path = f"{str_sha256(img_path)}.jpg"
  58. # img_hash256_path = f'{img_path}.jpg'
  59. crop_img = get_crop_img(bbox, page_pil_img, scale=scale)
  60. img_bytes = image_to_bytes(crop_img, image_format="JPEG")
  61. image_writer.write(img_hash256_path, img_bytes)
  62. return img_hash256_path
  63. def get_crop_img(bbox: tuple, pil_img, scale=2):
  64. scale_bbox = (
  65. int(bbox[0] * scale),
  66. int(bbox[1] * scale),
  67. int(bbox[2] * scale),
  68. int(bbox[3] * scale),
  69. )
  70. return pil_img.crop(scale_bbox)
  71. def images_bytes_to_pdf_bytes(image_bytes):
  72. # 内存缓冲区
  73. pdf_buffer = BytesIO()
  74. # 载入并转换所有图像为 RGB 模式
  75. image = Image.open(BytesIO(image_bytes)).convert("RGB")
  76. # 第一张图保存为 PDF,其余追加
  77. image.save(pdf_buffer, format="PDF", save_all=True)
  78. # 获取 PDF bytes 并重置指针(可选)
  79. pdf_bytes = pdf_buffer.getvalue()
  80. pdf_buffer.close()
  81. return pdf_bytes