|
|
@@ -36,7 +36,7 @@ class PageableData(ABC):
|
|
|
|
|
|
Args:
|
|
|
rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
|
|
|
- color (list[float] | None): three element tuple which descript the RGB of the board line, None means no board line
|
|
|
+ color (list[float] | None): three-element tuple which describes the RGB of the board line, None means no board line
|
|
|
fill (list[float] | None): fill the board with RGB, None means will not fill with color
|
|
|
fill_opacity (float): opacity of the fill, range from [0, 1]
|
|
|
width (float): the width of board
|
|
|
@@ -52,7 +52,7 @@ class PageableData(ABC):
|
|
|
coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
|
|
|
content (str): the text content
|
|
|
fontsize (int): font size of the text
|
|
|
- color (list[float] | None): three element tuple which descript the RGB of the board line, None will use the default font color!
|
|
|
+ color (list[float] | None): three-element tuple which describes the RGB of the text, None will use the default font color!
|
|
|
"""
|
|
|
pass
|
|
|
|
|
|
@@ -96,14 +96,39 @@ class Dataset(ABC):
|
|
|
|
|
|
@abstractmethod
|
|
|
def dump_to_file(self, file_path: str):
|
|
|
+ """Dump the dataset to a file.
|
|
|
+
|
|
|
+ Args:
|
|
|
+ file_path (str): the file path
|
|
|
+ """
|
|
|
pass
|
|
|
|
|
|
@abstractmethod
|
|
|
def apply(self, proc: Callable, *args, **kwargs):
|
|
|
+ """Apply a callable to this dataset.
|
|
|
+
|
|
|
+ Args:
|
|
|
+ proc (Callable): invoke proc as follows:
|
|
|
+ proc(dataset, *args, **kwargs)
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ Any: return the result generated by proc
|
|
|
+ """
|
|
|
pass
|
|
|
|
|
|
@abstractmethod
|
|
|
def classify(self) -> SupportedPdfParseMethod:
|
|
|
+ """classify the dataset
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ SupportedPdfParseMethod: the parse method selected for this dataset
|
|
|
+ """
|
|
|
+ pass
|
|
|
+
|
|
|
+ @abstractmethod
|
|
|
+ def clone(self):
|
|
|
+ """Return a clone of this dataset.
|
|
|
+ """
|
|
|
pass
|
|
|
|
|
|
|
|
|
@@ -151,18 +176,42 @@ class PymuDocDataset(Dataset):
|
|
|
return self._records[page_id]
|
|
|
|
|
|
def dump_to_file(self, file_path: str):
|
|
|
+ """Save the underlying document to file_path, creating the parent directory if needed.
|
|
|
+
|
|
|
+ Args:
|
|
|
+ file_path (str): the file path
|
|
|
+ """
|
|
|
+
|
|
|
dir_name = os.path.dirname(file_path)
|
|
|
if dir_name not in ('', '.', '..'):
|
|
|
os.makedirs(dir_name, exist_ok=True)
|
|
|
self._raw_fitz.save(file_path)
|
|
|
|
|
|
def apply(self, proc: Callable, *args, **kwargs):
|
|
|
- new_args = tuple([self] + list(args))
|
|
|
- return proc(*new_args, **kwargs)
|
|
|
+ """Apply a callable to this dataset.
|
|
|
+
|
|
|
+ Args:
|
|
|
+ proc (Callable): invoke proc as follows:
|
|
|
+ proc(dataset, *args, **kwargs)
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ Any: return the result generated by proc
|
|
|
+ """
|
|
|
+ return proc(self, *args, **kwargs)
|
|
|
|
|
|
def classify(self) -> SupportedPdfParseMethod:
|
|
|
+ """classify the dataset
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ SupportedPdfParseMethod: the method returned by classify() for this dataset's data bits
|
|
|
+ """
|
|
|
return classify(self._data_bits)
|
|
|
|
|
|
+ def clone(self):
|
|
|
+ """Return a new PymuDocDataset built from this dataset's raw data.
|
|
|
+ """
|
|
|
+ return PymuDocDataset(self._raw_data)
|
|
|
+
|
|
|
|
|
|
class ImageDataset(Dataset):
|
|
|
def __init__(self, bits: bytes):
|
|
|
@@ -209,17 +258,40 @@ class ImageDataset(Dataset):
|
|
|
return self._records[page_id]
|
|
|
|
|
|
def dump_to_file(self, file_path: str):
|
|
|
+ """Save the underlying document to file_path, creating the parent directory if needed.
|
|
|
+
|
|
|
+ Args:
|
|
|
+ file_path (str): the file path
|
|
|
+ """
|
|
|
dir_name = os.path.dirname(file_path)
|
|
|
if dir_name not in ('', '.', '..'):
|
|
|
os.makedirs(dir_name, exist_ok=True)
|
|
|
self._raw_fitz.save(file_path)
|
|
|
|
|
|
def apply(self, proc: Callable, *args, **kwargs):
|
|
|
+ """Apply a callable to this dataset.
|
|
|
+
|
|
|
+ Args:
|
|
|
+ proc (Callable): invoke proc as follows:
|
|
|
+ proc(dataset, *args, **kwargs)
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ Any: return the result generated by proc
|
|
|
+ """
|
|
|
return proc(self, *args, **kwargs)
|
|
|
|
|
|
def classify(self) -> SupportedPdfParseMethod:
|
|
|
+ """classify the dataset
|
|
|
+
|
|
|
+ Returns:
|
|
|
+ SupportedPdfParseMethod: always SupportedPdfParseMethod.OCR for an image dataset
|
|
|
+ """
|
|
|
return SupportedPdfParseMethod.OCR
|
|
|
|
|
|
+ def clone(self):
|
|
|
+ """Return a new ImageDataset built from this dataset's raw data.
|
|
|
+ """
|
|
|
+ return ImageDataset(self._raw_data)
|
|
|
|
|
|
class Doc(PageableData):
|
|
|
"""Initialized with pymudoc object."""
|
|
|
@@ -228,7 +300,7 @@ class Doc(PageableData):
|
|
|
self._doc = doc
|
|
|
|
|
|
def get_image(self):
|
|
|
- """Return the imge info.
|
|
|
+ """Return the image info.
|
|
|
|
|
|
Returns:
|
|
|
dict: {
|
|
|
@@ -266,7 +338,7 @@ class Doc(PageableData):
|
|
|
|
|
|
Args:
|
|
|
rect_coords (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
|
|
|
- color (list[float] | None): three element tuple which descript the RGB of the board line, None means no board line
|
|
|
+ color (list[float] | None): three-element tuple which describes the RGB of the board line, None means no board line
|
|
|
fill (list[float] | None): fill the board with RGB, None means will not fill with color
|
|
|
fill_opacity (float): opacity of the fill, range from [0, 1]
|
|
|
width (float): the width of board
|
|
|
@@ -288,6 +360,6 @@ class Doc(PageableData):
|
|
|
coord (list[float]): four elements array contain the top-left and bottom-right coordinates, [x0, y0, x1, y1]
|
|
|
content (str): the text content
|
|
|
fontsize (int): font size of the text
|
|
|
- color (list[float] | None): three element tuple which descript the RGB of the board line, None will use the default font color!
|
|
|
+ color (list[float] | None): three-element tuple which describes the RGB of the text, None will use the default font color!
|
|
|
"""
|
|
|
self._doc.insert_text(coord, content, fontsize=fontsize, color=color)
|