|
@@ -232,7 +232,7 @@ class PymuDocDataset(Dataset):
|
|
|
self._records[i].set_image(images[i])
|
|
self._records[i].set_image(images[i])
|
|
|
|
|
|
|
|
class ImageDataset(Dataset):
|
|
class ImageDataset(Dataset):
|
|
|
- def __init__(self, bits: bytes):
|
|
|
|
|
|
|
+ def __init__(self, bits: bytes, lang=None):
|
|
|
"""Initialize the dataset, which wraps the pymudoc documents.
|
|
"""Initialize the dataset, which wraps the pymudoc documents.
|
|
|
|
|
|
|
|
Args:
|
|
Args:
|
|
@@ -244,6 +244,17 @@ class ImageDataset(Dataset):
|
|
|
self._raw_data = bits
|
|
self._raw_data = bits
|
|
|
self._data_bits = pdf_bytes
|
|
self._data_bits = pdf_bytes
|
|
|
|
|
|
|
|
|
|
+ if lang == '':
|
|
|
|
|
+ self._lang = None
|
|
|
|
|
+ elif lang == 'auto':
|
|
|
|
|
+ from magic_pdf.model.sub_modules.language_detection.utils import \
|
|
|
|
|
+ auto_detect_lang
|
|
|
|
|
+ self._lang = auto_detect_lang(bits)
|
|
|
|
|
+ logger.info(f'lang: {lang}, detect_lang: {self._lang}')
|
|
|
|
|
+ else:
|
|
|
|
|
+ self._lang = lang
|
|
|
|
|
+ logger.info(f'lang: {lang}')
|
|
|
|
|
+
|
|
|
def __len__(self) -> int:
|
|
def __len__(self) -> int:
|
|
|
"""The length of the dataset."""
|
|
"""The length of the dataset."""
|
|
|
return len(self._records)
|
|
return len(self._records)
|