zhengchun
/
MinerU


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
							import os
from abc import ABC, abstractmethod
from typing import Callable, Iterator

import fitz
from loguru import logger

from magic_pdf.config.enums import SupportedPdfParseMethod
from magic_pdf.data.schemas import PageInfo
from magic_pdf.data.utils import fitz_doc_to_image
from magic_pdf.filter import classify


class PageableData(ABC):
    @abstractmethod
    def get_image(self) -> dict:
        """Transform data to image."""
        pass

    @abstractmethod
    def get_doc(self) -> fitz.Page:
        """Get the pymudoc page."""
        pass

    @abstractmethod
    def get_page_info(self) -> PageInfo:
        """Get the page info of the page.

        Returns:
            PageInfo: the page info of this page
        """
        pass

    @abstractmethod
    def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
        """draw rectangle.

        Args:
            rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
            color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line
            fill (list[float] | None): fill the board with RGB, None means will not fill with color
            fill_opacity (float): opacity of the fill, range from [0, 1]
            width (float): the width of board
            overlay (bool): fill the color in foreground or background. True means fill in background.
        """
        pass

    @abstractmethod
    def insert_text(self, coord, content, fontsize, color):
        """insert text.

        Args:
            coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
            content (str): the text content
            fontsize (int): font size of the text
            color (list[float] | None):  three element tuple which describe the RGB of the board line, None will use the default font color!
        """
        pass


class Dataset(ABC):
    @abstractmethod
    def __len__(self) -> int:
        """The length of the dataset."""
        pass

    @abstractmethod
    def __iter__(self) -> Iterator[PageableData]:
        """Yield the page data."""
        pass

    @abstractmethod
    def supported_methods(self) -> list[SupportedPdfParseMethod]:
        """The methods that this dataset support.

        Returns:
            list[SupportedPdfParseMethod]: The supported methods, Valid methods are: OCR, TXT
        """
        pass

    @abstractmethod
    def data_bits(self) -> bytes:
        """The bits used to create this dataset."""
        pass

    @abstractmethod
    def get_page(self, page_id: int) -> PageableData:
        """Get the page indexed by page_id.

        Args:
            page_id (int): the index of the page

        Returns:
            PageableData: the page doc object
        """
        pass

    @abstractmethod
    def dump_to_file(self, file_path: str):
        """Dump the file

        Args: 
            file_path (str): the file path 
        """
        pass

    @abstractmethod
    def apply(self, proc: Callable, *args, **kwargs):
        """Apply callable method which.

        Args:
            proc (Callable): invoke proc as follows:
                proc(self, *args, **kwargs)

        Returns:
            Any: return the result generated by proc
        """
        pass

    @abstractmethod
    def classify(self) -> SupportedPdfParseMethod:
        """classify the dataset 

        Returns:
            SupportedPdfParseMethod: _description_
        """
        pass

    @abstractmethod
    def clone(self):
        """clone this dataset
        """
        pass


class PymuDocDataset(Dataset):
    def __init__(self, bits: bytes, lang=None):
        """Initialize the dataset, which wraps the pymudoc documents.

        Args:
            bits (bytes): the bytes of the pdf
        """
        self._raw_fitz = fitz.open('pdf', bits)
        self._records = [Doc(v) for v in self._raw_fitz]
        self._data_bits = bits
        self._raw_data = bits

        if lang == '':
            self._lang = None
        elif lang == 'auto':
            from magic_pdf.model.sub_modules.language_detection.utils import auto_detect_lang
            self._lang = auto_detect_lang(bits)
            logger.info(f"lang: {lang}, detect_lang: {self._lang}")
        else:
            self._lang = lang
            logger.info(f"lang: {lang}")
    def __len__(self) -> int:
        """The page number of the pdf."""
        return len(self._records)

    def __iter__(self) -> Iterator[PageableData]:
        """Yield the page doc object."""
        return iter(self._records)

    def supported_methods(self) -> list[SupportedPdfParseMethod]:
        """The method supported by this dataset.

        Returns:
            list[SupportedPdfParseMethod]: the supported methods
        """
        return [SupportedPdfParseMethod.OCR, SupportedPdfParseMethod.TXT]

    def data_bits(self) -> bytes:
        """The pdf bits used to create this dataset."""
        return self._data_bits

    def get_page(self, page_id: int) -> PageableData:
        """The page doc object.

        Args:
            page_id (int): the page doc index

        Returns:
            PageableData: the page doc object
        """
        return self._records[page_id]

    def dump_to_file(self, file_path: str):
        """Dump the file

        Args: 
            file_path (str): the file path 
        """
        
        dir_name = os.path.dirname(file_path)
        if dir_name not in ('', '.', '..'):
            os.makedirs(dir_name, exist_ok=True)
        self._raw_fitz.save(file_path)

    def apply(self, proc: Callable, *args, **kwargs):
        """Apply callable method which.

        Args:
            proc (Callable): invoke proc as follows:
                proc(dataset, *args, **kwargs)

        Returns:
            Any: return the result generated by proc
        """
        if 'lang' in kwargs and self._lang is not None:
            kwargs['lang'] = self._lang
        return proc(self, *args, **kwargs)

    def classify(self) -> SupportedPdfParseMethod:
        """classify the dataset 

        Returns:
            SupportedPdfParseMethod: _description_
        """
        return classify(self._data_bits)

    def clone(self):
        """clone this dataset
        """
        return PymuDocDataset(self._raw_data)


class ImageDataset(Dataset):
    def __init__(self, bits: bytes):
        """Initialize the dataset, which wraps the pymudoc documents.

        Args:
            bits (bytes): the bytes of the photo which will be converted to pdf first. then converted to pymudoc.
        """
        pdf_bytes = fitz.open(stream=bits).convert_to_pdf()
        self._raw_fitz = fitz.open('pdf', pdf_bytes)
        self._records = [Doc(v) for v in self._raw_fitz]
        self._raw_data = bits
        self._data_bits = pdf_bytes

    def __len__(self) -> int:
        """The length of the dataset."""
        return len(self._records)

    def __iter__(self) -> Iterator[PageableData]:
        """Yield the page object."""
        return iter(self._records)

    def supported_methods(self):
        """The method supported by this dataset.

        Returns:
            list[SupportedPdfParseMethod]: the supported methods
        """
        return [SupportedPdfParseMethod.OCR]

    def data_bits(self) -> bytes:
        """The pdf bits used to create this dataset."""
        return self._data_bits

    def get_page(self, page_id: int) -> PageableData:
        """The page doc object.

        Args:
            page_id (int): the page doc index

        Returns:
            PageableData: the page doc object
        """
        return self._records[page_id]

    def dump_to_file(self, file_path: str):
        """Dump the file

        Args: 
            file_path (str): the file path 
        """
        dir_name = os.path.dirname(file_path)
        if dir_name not in ('', '.', '..'):
            os.makedirs(dir_name, exist_ok=True)
        self._raw_fitz.save(file_path)

    def apply(self, proc: Callable, *args, **kwargs):
        """Apply callable method which.

        Args:
            proc (Callable): invoke proc as follows:
                proc(dataset, *args, **kwargs)

        Returns:
            Any: return the result generated by proc
        """
        return proc(self, *args, **kwargs)

    def classify(self) -> SupportedPdfParseMethod:
        """classify the dataset 

        Returns:
            SupportedPdfParseMethod: _description_
        """
        return SupportedPdfParseMethod.OCR

    def clone(self):
        """clone this dataset
        """
        return ImageDataset(self._raw_data)

class Doc(PageableData):
    """Initialized with pymudoc object."""

    def __init__(self, doc: fitz.Page):
        self._doc = doc

    def get_image(self):
        """Return the image info.

        Returns:
            dict: {
                img: np.ndarray,
                width: int,
                height: int
            }
        """
        return fitz_doc_to_image(self._doc)

    def get_doc(self) -> fitz.Page:
        """Get the pymudoc object.

        Returns:
            fitz.Page: the pymudoc object
        """
        return self._doc

    def get_page_info(self) -> PageInfo:
        """Get the page info of the page.

        Returns:
            PageInfo: the page info of this page
        """
        page_w = self._doc.rect.width
        page_h = self._doc.rect.height
        return PageInfo(w=page_w, h=page_h)

    def __getattr__(self, name):
        if hasattr(self._doc, name):
            return getattr(self._doc, name)

    def draw_rect(self, rect_coords, color, fill, fill_opacity, width, overlay):
        """draw rectangle.

        Args:
            rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
            color (list[float] | None): three element tuple which describe the RGB of the board line, None means no board line
            fill (list[float] | None): fill the board with RGB, None means will not fill with color
            fill_opacity (float): opacity of the fill, range from [0, 1]
            width (float): the width of board
            overlay (bool): fill the color in foreground or background. True means fill in background.
        """
        self._doc.draw_rect(
            rect_coords,
            color=color,
            fill=fill,
            fill_opacity=fill_opacity,
            width=width,
            overlay=overlay,
        )

    def insert_text(self, coord, content, fontsize, color):
        """insert text.

        Args:
            coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
            content (str): the text content
            fontsize (int): font size of the text
            color (list[float] | None):  three element tuple which describe the RGB of the board line, None will use the default font color!
        """
        self._doc.insert_text(coord, content, fontsize=fontsize, color=color)