import enum

import fitz
import numpy as np
from pydantic import BaseModel, Field
from PIL import Image

# Largest pixmap side (in pixels) accepted at the requested DPI; anything
# wider or taller is re-rendered at fitz's native 72 DPI instead.
_MAX_PIXMAP_SIDE = 4500

# PDF user space is defined at 72 units per inch, so the zoom factor for a
# given DPI is ``dpi / 72``.
_PDF_BASE_DPI = 72


class SupportedPdfParseMethod(enum.Enum):
    """Identifiers of the available PDF parse methods."""

    OCR = 'ocr'
    TXT = 'txt'


class PageInfo(BaseModel):
    """The width and height of a page."""

    w: float = Field(description='the width of page')
    h: float = Field(description='the height of page')


def fitz_doc_to_image(doc, target_dpi=200, origin_dpi=None) -> Image.Image:
    """Render a PyMuPDF page to a PIL image.

    The page is first rendered at ``target_dpi``. If the resulting pixmap is
    wider or taller than 4500 pixels, it is rendered again at fitz's default
    72 DPI (zoom factor 1) to keep the image size bounded.

    Args:
        doc: The page to render. Despite the parameter name this is a single
            page (``load_images_from_pdf`` passes ``doc[index]``); it only
            needs to provide ``get_pixmap(matrix=..., alpha=...)``.
        target_dpi (int, optional): Resolution used for the first rendering
            attempt. Defaults to 200.
        origin_dpi: Unused. Kept only so existing callers that pass it keep
            working.

    Returns:
        PIL.Image.Image: The rendered page, built from the pixmap samples
        interpreted as packed RGB (the pixmap is rendered without alpha).
    """
    zoom = target_dpi / _PDF_BASE_DPI
    pm = doc.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

    # Oversized render: fall back to the fitz default dpi (identity zoom).
    if pm.width > _MAX_PIXMAP_SIDE or pm.height > _MAX_PIXMAP_SIDE:
        pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

    return Image.frombytes('RGB', (pm.width, pm.height), pm.samples)


def load_images_from_pdf(pdf_file, dpi=200, start_page_id=0, end_page_id=None) -> list:
    """Render a range of pages of a PDF into PIL images.

    Args:
        pdf_file: The PDF to open; handed to ``fitz.open`` unchanged
            (presumably a file path -- confirm against callers).
        dpi (int, optional): Target resolution passed to
            ``fitz_doc_to_image``. Defaults to 200.
        start_page_id (int, optional): Zero-based index of the first page to
            render. Values below 0 are treated as 0. Defaults to 0.
        end_page_id (int, optional): Zero-based index of the last page to
            render (inclusive). ``None`` or a negative value selects the last
            page of the document; a value past the end is clamped to the last
            page and a notice is printed.

    Returns:
        list: One ``PIL.Image.Image`` per rendered page, in page order. Empty
        when the requested range contains no pages.
    """
    with fitz.open(pdf_file) as doc:
        last_page_id = doc.page_count - 1

        if end_page_id is None or end_page_id < 0:
            end_page_id = last_page_id
        elif end_page_id > last_page_id:
            print('end_page_id is out of range, use images length')
            end_page_id = last_page_id

        # Clamp the start so a negative value cannot wrap around to pages
        # counted from the end of the document.
        first_page_id = max(start_page_id, 0)

        return [
            fitz_doc_to_image(doc[index], target_dpi=dpi)
            for index in range(first_page_id, end_page_id + 1)
        ]