| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960 |
- import fitz
- import numpy as np
- import enum
- from pydantic import BaseModel, Field
- from PIL import Image
class SupportedPdfParseMethod(enum.Enum):
    """Enumeration of the parse methods supported for a PDF.

    Each member's value is the lowercase string form of its name, so a
    member can be looked up from that string, e.g.
    ``SupportedPdfParseMethod('ocr')``.
    """

    # NOTE(review): presumably selects OCR-based parsing — confirm with callers.
    OCR = 'ocr'
    # NOTE(review): presumably selects direct text extraction — confirm with callers.
    TXT = 'txt'
class PageInfo(BaseModel):
    """Width and height of a single page.

    Both fields are declared without a default value; the units are not
    specified by this model.
    """

    # Page width.
    w: float = Field(description='the width of page')
    # Page height.
    h: float = Field(description='the height of page')
def fitz_doc_to_image(doc, target_dpi=200, origin_dpi=None) -> "Image.Image":
    """Render a PyMuPDF page to an RGB PIL image.

    The page is first rasterized at ``target_dpi``. If either side of the
    resulting pixmap is larger than 4500 pixels, the page is rendered again
    with an identity matrix (fitz's default 72 dpi) so that very large pages
    do not produce oversized images.

    Args:
        doc: Object exposing ``get_pixmap(matrix=..., alpha=...)``. Despite
            the parameter name, ``load_images_from_pdf`` passes a single
            page (``doc[index]``) here.
        target_dpi (int, optional): Resolution used for the first render
            attempt. Defaults to 200.
        origin_dpi: Unused; kept so existing callers keep working.

    Returns:
        PIL.Image.Image: RGB image built from the pixmap's samples. Note that
        this is a PIL image, not a dict or a numpy array.
    """
    # ``Image`` comes from the module-level ``from PIL import Image``.
    base_dpi = 72     # fitz renders at 72 dpi when the matrix is identity
    max_side = 4500   # largest width/height accepted at target_dpi

    zoom = target_dpi / base_dpi
    pm = doc.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

    # Too large after scaling: fall back to the unscaled (default dpi) render.
    if pm.width > max_side or pm.height > max_side:
        pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

    return Image.frombytes('RGB', (pm.width, pm.height), pm.samples)
def load_images_from_pdf(pdf_file, dpi=200, start_page_id=0, end_page_id=None) -> list:
    """Render an inclusive range of pages of a PDF into images.

    Args:
        pdf_file: Passed unchanged to ``fitz.open``.
        dpi (int, optional): Forwarded to ``fitz_doc_to_image`` as
            ``target_dpi``. Defaults to 200.
        start_page_id (int, optional): Zero-based index of the first page to
            render. Defaults to 0.
        end_page_id (int, optional): Zero-based index of the last page to
            render (inclusive). ``None`` or a negative value selects the last
            page of the document. A value past the last page is clamped to
            the last page and a message is printed.

    Returns:
        list: One entry per rendered page, in page order, each being the
        value returned by ``fitz_doc_to_image``.
    """
    with fitz.open(pdf_file) as doc:
        last_page = doc.page_count - 1

        # Only a non-negative, explicitly given end index is honoured.
        has_explicit_end = end_page_id is not None and end_page_id >= 0
        if not has_explicit_end:
            end_page_id = last_page

        if end_page_id > last_page:
            print('end_page_id is out of range, use images length')
            end_page_id = last_page

        # Filter rather than slice so out-of-range start values simply
        # select fewer (or no) pages.
        rendered = [
            fitz_doc_to_image(doc[page_no], target_dpi=dpi)
            for page_no in range(doc.page_count)
            if start_page_id <= page_no <= end_page_id
        ]
    return rendered
|