Pārlūkot izejas kodu

fix: image dataset add lang field

icecraft 7 mēneši atpakaļ
vecāks
revīzija
e36a083dc3
1 mainīts fails ar 12 papildinājumiem un 1 dzēšanu
  1. 12 1
      magic_pdf/data/dataset.py

+ 12 - 1
magic_pdf/data/dataset.py

@@ -232,7 +232,7 @@ class PymuDocDataset(Dataset):
             self._records[i].set_image(images[i])
 
 class ImageDataset(Dataset):
-    def __init__(self, bits: bytes):
+    def __init__(self, bits: bytes, lang=None):
         """Initialize the dataset, which wraps the pymudoc documents.
 
         Args:
@@ -244,6 +244,17 @@ class ImageDataset(Dataset):
         self._raw_data = bits
         self._data_bits = pdf_bytes
 
+        if lang == '':
+            self._lang = None
+        elif lang == 'auto':
+            from magic_pdf.model.sub_modules.language_detection.utils import \
+                auto_detect_lang
+            self._lang = auto_detect_lang(bits)
+            logger.info(f'lang: {lang}, detect_lang: {self._lang}')
+        else:
+            self._lang = lang
+            logger.info(f'lang: {lang}')
+
     def __len__(self) -> int:
         """The length of the dataset."""
         return len(self._records)