pdf_page_tools.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940
  1. # Copyright (c) Opendatalab. All rights reserved.
  2. import io
  3. import pypdfium2 as pdfium
  4. from loguru import logger
  5. def get_end_page_id(end_page_id, pdf_page_num):
  6. end_page_id = end_page_id if end_page_id is not None and end_page_id >= 0 else pdf_page_num - 1
  7. if end_page_id > pdf_page_num - 1:
  8. logger.warning("end_page_id is out of range, use images length")
  9. end_page_id = pdf_page_num - 1
  10. return end_page_id
  11. def convert_pdf_bytes_to_bytes_by_pypdfium2(pdf_bytes, start_page_id=0, end_page_id=None):
  12. pdf = pdfium.PdfDocument(pdf_bytes)
  13. output_pdf = pdfium.PdfDocument.new()
  14. try:
  15. end_page_id = get_end_page_id(end_page_id, len(pdf))
  16. # 选择要导入的页面索引
  17. page_indices = list(range(start_page_id, end_page_id + 1))
  18. # 从原PDF导入页面到新PDF
  19. output_pdf.import_pages(pdf, page_indices)
  20. # 将新PDF保存到内存缓冲区
  21. output_buffer = io.BytesIO()
  22. output_pdf.save(output_buffer)
  23. # 获取字节数据
  24. output_bytes = output_buffer.getvalue()
  25. except Exception as e:
  26. logger.warning(f"Error in converting PDF bytes: {e}, Using original PDF bytes.")
  27. output_bytes = pdf_bytes
  28. pdf.close()
  29. output_pdf.close()
  30. return output_bytes