| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194 |
- from abc import ABC, abstractmethod
- from typing import Iterator
- import fitz
- from magic_pdf.config.enums import SupportedPdfParseMethod
- from magic_pdf.data.schemas import PageInfo
- from magic_pdf.data.utils import fitz_doc_to_image
class PageableData(ABC):
    """Abstract interface implemented by every single-page data object."""

    @abstractmethod
    def get_image(self) -> dict:
        """Convert the page data into its image representation.

        Returns:
            dict: the image form of this page
        """
        ...

    @abstractmethod
    def get_doc(self) -> fitz.Page:
        """Return the PyMuPDF page behind this object.

        Returns:
            fitz.Page: the PyMuPDF page object
        """
        ...

    @abstractmethod
    def get_page_info(self) -> PageInfo:
        """Describe this page.

        Returns:
            PageInfo: the page info of this page
        """
        ...
class Dataset(ABC):
    """Abstract interface for a collection of pages built from raw bytes."""

    @abstractmethod
    def __len__(self) -> int:
        """Return how many pages the dataset holds."""
        ...

    @abstractmethod
    def __iter__(self) -> Iterator[PageableData]:
        """Iterate over the page data objects of the dataset."""
        ...

    @abstractmethod
    def supported_methods(self) -> list[SupportedPdfParseMethod]:
        """Report which parse methods this dataset can be used with.

        Returns:
            list[SupportedPdfParseMethod]: the supported methods; valid
                methods are OCR and TXT
        """
        ...

    @abstractmethod
    def data_bits(self) -> bytes:
        """Return the bytes this dataset was created from."""
        ...

    @abstractmethod
    def get_page(self, page_id: int) -> PageableData:
        """Fetch a single page by its index.

        Args:
            page_id (int): the index of the page

        Returns:
            PageableData: the page doc object
        """
        ...
class PymuDocDataset(Dataset):
    """Dataset over a PDF opened with PyMuPDF; each page is wrapped in a ``Doc``."""

    def __init__(self, bits: bytes):
        """Initialize the dataset, which wraps the PyMuPDF document.

        Args:
            bits (bytes): the bytes of the pdf
        """
        # Hold on to the Document for as long as the dataset lives. The pages
        # wrapped below are owned by it; leaving it as an unreferenced
        # temporary lets it be collected while its pages are still in use.
        self._raw_fitz = fitz.open('pdf', bits)
        self._records = [Doc(page) for page in self._raw_fitz]
        self._data_bits = bits
        self._raw_data = bits

    def __len__(self) -> int:
        """The page number of the pdf."""
        return len(self._records)

    def __iter__(self) -> Iterator[PageableData]:
        """Yield the page doc objects in page order."""
        return iter(self._records)

    def supported_methods(self) -> list[SupportedPdfParseMethod]:
        """The methods supported by this dataset.

        Returns:
            list[SupportedPdfParseMethod]: the supported methods (OCR and TXT)
        """
        return [SupportedPdfParseMethod.OCR, SupportedPdfParseMethod.TXT]

    def data_bits(self) -> bytes:
        """The pdf bits used to create this dataset."""
        return self._data_bits

    def get_page(self, page_id: int) -> PageableData:
        """The page doc object.

        Args:
            page_id (int): the page doc index

        Returns:
            PageableData: the page doc object

        Raises:
            IndexError: if ``page_id`` is outside the range of loaded pages
        """
        return self._records[page_id]
class ImageDataset(Dataset):
    """Dataset over an image that is converted to PDF and opened with PyMuPDF."""

    def __init__(self, bits: bytes):
        """Initialize the dataset, which wraps the PyMuPDF document.

        Args:
            bits (bytes): the bytes of the photo, which is converted to pdf
                first and then opened as a PyMuPDF document.
        """
        # The image document is only needed for the conversion; close it
        # deterministically instead of leaving it to the garbage collector.
        with fitz.open(stream=bits) as image_doc:
            pdf_bytes = image_doc.convert_to_pdf()

        # Hold on to the PDF Document for as long as the dataset lives. The
        # pages wrapped below are owned by it; leaving it as an unreferenced
        # temporary lets it be collected while its pages are still in use.
        self._raw_fitz = fitz.open('pdf', pdf_bytes)
        self._records = [Doc(page) for page in self._raw_fitz]
        self._raw_data = bits
        self._data_bits = pdf_bytes

    def __len__(self) -> int:
        """The length of the dataset."""
        return len(self._records)

    def __iter__(self) -> Iterator[PageableData]:
        """Yield the page objects in page order."""
        return iter(self._records)

    def supported_methods(self) -> list[SupportedPdfParseMethod]:
        """The methods supported by this dataset.

        Returns:
            list[SupportedPdfParseMethod]: the supported methods (OCR only)
        """
        return [SupportedPdfParseMethod.OCR]

    def data_bits(self) -> bytes:
        """The pdf bits (converted from the image) used to create this dataset."""
        return self._data_bits

    def get_page(self, page_id: int) -> PageableData:
        """The page doc object.

        Args:
            page_id (int): the page doc index

        Returns:
            PageableData: the page doc object

        Raises:
            IndexError: if ``page_id`` is outside the range of loaded pages
        """
        return self._records[page_id]
class Doc(PageableData):
    """Page wrapper initialized with a PyMuPDF page object."""

    def __init__(self, doc: fitz.Page):
        self._doc = doc

    def get_image(self) -> dict:
        """Return the image info.

        Returns:
            dict: {
                img: np.ndarray,
                width: int,
                height: int
            }
        """
        return fitz_doc_to_image(self._doc)

    def get_doc(self) -> fitz.Page:
        """Get the PyMuPDF object.

        Returns:
            fitz.Page: the PyMuPDF page object
        """
        return self._doc

    def get_page_info(self) -> PageInfo:
        """Get the page info of the page.

        Returns:
            PageInfo: the page info of this page, built from the width and
                height of the page rectangle
        """
        rect = self._doc.rect
        return PageInfo(w=rect.width, h=rect.height)

    def __getattr__(self, name):
        """Delegate attributes missing on the wrapper to the wrapped page.

        Args:
            name (str): the attribute being looked up

        Returns:
            The attribute of the wrapped page with that name.

        Raises:
            AttributeError: if the wrapped page has no such attribute, or if
                the wrapper has no ``_doc`` yet.
        """
        # __getattr__ only runs after normal lookup has failed, so seeing
        # ``_doc`` here means the instance was never initialized. Reading
        # ``self._doc`` below would then re-enter this method forever.
        if name == '_doc':
            raise AttributeError(name)
        # Let getattr raise AttributeError for unknown names instead of
        # silently returning None, so hasattr() and typos behave normally.
        return getattr(self._doc, name)
|