doc_utils.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. import fitz
  2. import numpy as np
  3. import enum
  4. from pydantic import BaseModel, Field
  5. from PIL import Image
  6. class SupportedPdfParseMethod(enum.Enum):
  7. OCR = 'ocr'
  8. TXT = 'txt'
  9. class PageInfo(BaseModel):
  10. """The width and height of page
  11. """
  12. w: float = Field(description='the width of page')
  13. h: float = Field(description='the height of page')
  14. def fitz_doc_to_image(doc, target_dpi=200, origin_dpi=None) -> Image.Image:
  15. """Convert fitz.Document to image, Then convert the image to numpy array.
  16. Args:
  17. doc (_type_): pymudoc page
  18. dpi (int, optional): reset the dpi of dpi. Defaults to 200.
  19. Returns:
  20. dict: {'img': numpy array, 'width': width, 'height': height }
  21. """
  22. from PIL import Image
  23. mat = fitz.Matrix(target_dpi / 72, target_dpi / 72)
  24. pm = doc.get_pixmap(matrix=mat, alpha=False)
  25. if pm.width > 4500 or pm.height > 4500:
  26. mat = fitz.Matrix(72 / 72, 72 / 72) # use fitz default dpi
  27. pm = doc.get_pixmap(matrix=mat, alpha=False)
  28. image = Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
  29. return image
  30. def load_images_from_pdf(pdf_file, dpi=200, start_page_id=0, end_page_id=None) -> list:
  31. images = []
  32. with fitz.open(pdf_file) as doc:
  33. pdf_page_num = doc.page_count
  34. end_page_id = (
  35. end_page_id
  36. if end_page_id is not None and end_page_id >= 0
  37. else pdf_page_num - 1
  38. )
  39. if end_page_id > pdf_page_num - 1:
  40. print('end_page_id is out of range, use images length')
  41. end_page_id = pdf_page_num - 1
  42. for index in range(0, doc.page_count):
  43. if start_page_id <= index <= end_page_id:
  44. page = doc[index]
  45. img = fitz_doc_to_image(page, target_dpi=dpi)
  46. images.append(img)
  47. return images