Browse Source

Support saving the page index in the result file name when the input is a PDF file (#2868)

* support saving the page index in the result file name when the input is a PDF file

* OCR pipeline supports PDF input

* use a random name for the saved result when there is no input file

* OCR-related pipelines support PDF input

* bugfix
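
For context, a minimal usage sketch of the behavior these changes target (assuming the PaddleX `create_pipeline` API; the input file and the exact saved file names are illustrative, not taken from the diff):

    from paddlex import create_pipeline

    # Feed a PDF directly to the OCR pipeline; each page is sampled as one
    # instance and yields one result.
    pipeline = create_pipeline(pipeline="OCR")
    for res in pipeline.predict("doc.pdf"):
        # Saved file names now carry the source stem plus the page index,
        # e.g. doc_0_*.png / doc_0_*.json for page 0, doc_1_* for page 1, ...
        res.save_to_img("./output/")
        res.save_to_json("./output/")

    # For ndarray inputs there is no source file name, so results fall back to a
    # generated "<timestamp>_<random-number>" stem instead.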
Tingquan Gao 10 months ago
parent
commit
4a42505cfe
33 changed files with 234 additions and 161 deletions
  1. +29 -8    paddlex/inference/common/batch_sampler/image_batch_sampler.py
  2. +0 -3     paddlex/inference/common/result/base_cv_result.py
  3. +15 -0    paddlex/inference/common/result/base_result.py
  4. +17 -18   paddlex/inference/common/result/mixin.py
  5. +3 -2     paddlex/inference/models_new/anomaly_detection/predictor.py
  6. +0 -2     paddlex/inference/models_new/common/static_infer.py
  7. +3 -2     paddlex/inference/models_new/face_feature/predictor.py
  8. +3 -2     paddlex/inference/models_new/formula_recognition/predictor.py
  9. +9 -0     paddlex/inference/models_new/formula_recognition/result.py
  10. +3 -2    paddlex/inference/models_new/image_classification/predictor.py
  11. +1 -0    paddlex/inference/models_new/image_classification/result.py
  12. +3 -2    paddlex/inference/models_new/image_feature/predictor.py
  13. +3 -2    paddlex/inference/models_new/image_multilabel_classification/predictor.py
  14. +3 -2    paddlex/inference/models_new/image_unwarping/predictor.py
  15. +3 -2    paddlex/inference/models_new/instance_segmentation/predictor.py
  16. +31 -22  paddlex/inference/models_new/object_detection/predictor.py
  17. +3 -2    paddlex/inference/models_new/semantic_segmentation/predictor.py
  18. +3 -2    paddlex/inference/models_new/table_structure_recognition/predictor.py
  19. +8 -2    paddlex/inference/models_new/table_structure_recognition/result.py
  20. +3 -2    paddlex/inference/models_new/text_detection/predictor.py
  21. +9 -2    paddlex/inference/models_new/text_detection/result.py
  22. +3 -2    paddlex/inference/models_new/text_recognition/predictor.py
  23. +4 -2    paddlex/inference/pipelines_new/attribute_recognition/pipeline.py
  24. +3 -8    paddlex/inference/pipelines_new/doc_preprocessor/pipeline.py
  25. +3 -8    paddlex/inference/pipelines_new/formula_recognition/pipeline.py
  26. +3 -8    paddlex/inference/pipelines_new/layout_parsing/pipeline.py
  27. +3 -8    paddlex/inference/pipelines_new/layout_parsing/pipeline_v2.py
  28. +3 -8    paddlex/inference/pipelines_new/ocr/pipeline.py
  29. +15 -2   paddlex/inference/pipelines_new/ocr/result.py
  30. +24 -18  paddlex/inference/pipelines_new/seal_recognition/pipeline.py
  31. +3 -8    paddlex/inference/pipelines_new/table_recognition/pipeline.py
  32. +3 -8    paddlex/inference/pipelines_new/table_recognition/pipeline_v2.py
  33. +15 -2   paddlex/inference/pipelines_new/table_recognition/result.py

+ 29 - 8
paddlex/inference/common/batch_sampler/image_batch_sampler.py

@@ -24,6 +24,26 @@ from ...utils.io import PDFReader
 from .base_batch_sampler import BaseBatchSampler


+class ImgInstance:
+    def __init__(self):
+        self.instances = []
+        self.input_paths = []
+        self.page_indexes = []
+
+    def append(self, instance, input_path, page_index):
+        self.instances.append(instance)
+        self.input_paths.append(input_path)
+        self.page_indexes.append(page_index)
+
+    def reset(self):
+        self.instances = []
+        self.input_paths = []
+        self.page_indexes = []
+
+    def __len__(self):
+        return len(self.instances)
+
+
 class ImageBatchSampler(BaseBatchSampler):

     SUFFIX = ["jpg", "png", "jpeg", "JPEG", "JPG", "bmp"]
@@ -60,24 +80,25 @@ class ImageBatchSampler(BaseBatchSampler):
         if not isinstance(inputs, list):
             inputs = [inputs]

-        batch = []
+        batch = {"instances": [], "input_paths": [], "page_indexes": []}
+        batch = ImgInstance()
         for input in inputs:
             if isinstance(input, np.ndarray):
-                batch.append(input)
+                batch.append(input, None, None)
                 if len(batch) == self.batch_size:
                     yield batch
-                    batch = []
+                    batch.reset()
             elif isinstance(input, str) and input.split(".")[-1] in ("PDF", "pdf"):
                 file_path = (
                     self._download_from_url(input)
                     if input.startswith("http")
                     else input
                 )
-                for page_img in self.pdf_reader.read(file_path):
-                    batch.append(page_img)
+                for page_idx, page_img in enumerate(self.pdf_reader.read(file_path)):
+                    batch.append(page_img, file_path, page_idx)
                     if len(batch) == self.batch_size:
                         yield batch
-                        batch = []
+                        batch.reset()
             elif isinstance(input, str):
                 file_path = (
                     self._download_from_url(input)
@@ -86,10 +107,10 @@ class ImageBatchSampler(BaseBatchSampler):
                 )
                 file_list = self._get_files_list(file_path)
                 for file_path in file_list:
-                    batch.append(file_path)
+                    batch.append(file_path, file_path, None)
                     if len(batch) == self.batch_size:
                         yield batch
-                        batch = []
+                        batch.reset()
             else:
                 logging.warning(
                     f"Not supported input data type! Only `numpy.ndarray` and `str` are supported! So has been ignored: {input}."

+ 0 - 3
paddlex/inference/common/result/base_cv_result.py

@@ -26,9 +26,6 @@ class BaseCVResult(BaseResult, ImgMixin):

         Args:
             data (dict): The initial data.
-
-        Raises:
-            AssertionError: If the required key (`BaseCVResult.INPUT_IMG_KEY`) are not found in the data.
         """
         super().__init__(data)
         ImgMixin.__init__(self, "pillow")

+ 15 - 0
paddlex/inference/common/result/base_result.py

@@ -13,6 +13,11 @@
 # limitations under the License.

 import inspect
+from pathlib import Path
+import time
+import random
+
+from ....utils import logging
 from .mixin import StrMixin, JsonMixin, ImgMixin


@@ -45,3 +50,13 @@ class BaseResult(dict, JsonMixin, StrMixin):
                 func(save_path=save_path)
             else:
                 func()
+
+    def _get_input_fn(self):
+        if (fp := self["input_path"]) is None:
+            timestamp = int(time.time())
+            random_number = random.randint(1000, 9999)
+            fp = f"{timestamp}_{random_number}"
+            logging.warning(
+                f"There is not input file name as reference for name of saved result file. So the saved result file would be named with timestamp and random number: `{fp}`."
+            )
+        return Path(fp).name

+ 17 - 18
paddlex/inference/common/result/mixin.py

@@ -144,9 +144,8 @@ class JsonMixin:
             return mime_type is not None and mime_type == "application/json"

         if not _is_json_file(save_path):
-            fp = Path(self["input_path"])
-            stem = fp.stem
-            suffix = fp.suffix
+            fn = Path(self._get_input_fn())
+            stem = fn.stem
             base_save_path = Path(save_path)
             for key in self.json:
                 save_path = base_save_path / f"{stem}_{key}.json"
@@ -247,9 +246,8 @@ class Base64Mixin:
             **kwargs: Additional keyword arguments that will be passed to the base64 writer.
         """
         if not str(save_path).lower().endswith((".b64")):
-            fp = Path(self["input_path"])
-            stem = fp.stem
-            suffix = fp.suffix
+            fn = Path(self._get_input_fn())
+            stem = fn.stem
             base_save_path = Path(save_path)
             for key in self.base64:
                 save_path = base_save_path / f"{stem}_{key}.b64"
@@ -312,9 +310,9 @@ class ImgMixin:
             return mime_type is not None and mime_type.startswith("image/")

         if not _is_image_file(save_path):
-            fp = Path(self["input_path"])
-            stem = fp.stem
-            suffix = fp.suffix
+            fn = Path(self._get_input_fn())
+            suffix = fn.suffix if _is_image_file(fn) else ".png"
+            stem = fn.stem
             base_save_path = Path(save_path)
             for key in self.img:
                 save_path = base_save_path / f"{stem}_{key}{suffix}"
@@ -380,8 +378,9 @@ class CSVMixin:
             return mime_type is not None and mime_type == "text/csv"

         if not _is_csv_file(save_path):
-            fp = Path(self["input_path"])
-            stem = fp.stem
+            fn = Path(self._get_input_fn())
+            fn = Path(self._get_input_fn())
+            stem = fn.stem
             base_save_path = Path(save_path)
             for key in self.csv:
                 save_path = base_save_path / f"{stem}_{key}.csv"
@@ -444,8 +443,8 @@ class HtmlMixin:
             return mime_type is not None and mime_type == "text/html"

         if not _is_html_file(save_path):
-            fp = Path(self["input_path"])
-            stem = fp.stem
+            fn = Path(self._get_input_fn())
+            stem = fn.stem
             base_save_path = Path(save_path)
             for key in self.html:
                 save_path = base_save_path / f"{stem}_{key}.html"
@@ -512,8 +511,8 @@ class XlsxMixin:
             )

         if not _is_xlsx_file(save_path):
-            fp = Path(self["input_path"])
-            stem = fp.stem
+            fn = Path(self._get_input_fn())
+            stem = fn.stem
             base_save_path = Path(save_path)
             for key in self.xlsx:
                 save_path = base_save_path / f"{stem}_{key}.xlsx"
@@ -578,9 +577,9 @@ class VideoMixin:
         video_writer = VideoWriter(backend=self._backend, *args, **kwargs)

         if not _is_video_file(save_path):
-            fp = Path(self["input_path"])
-            stem = fp.stem
-            suffix = fp.suffix
+            fn = Path(self._get_input_fn())
+            stem = fn.stem
+            suffix = fn.suffix if _is_video_file(fn) else ".mp4"
             base_save_path = Path(save_path)
             for key in self.video:
                 save_path = base_save_path / f"{stem}_{key}{suffix}"

+ 3 - 2
paddlex/inference/models_new/anomaly_detection/predictor.py

@@ -100,7 +100,7 @@ class UadPredictor(BasicPredictor):
         Returns:
             dict: A dictionary containing the input path, raw image, and predicted segmentation maps for every instance of the batch. Keys include 'input_path', 'input_img', and 'pred'.
         """
-        batch_raw_imgs = self.preprocessors["Read"](imgs=batch_data)
+        batch_raw_imgs = self.preprocessors["Read"](imgs=batch_data.instances)
         batch_imgs = self.preprocessors["Resize"](imgs=batch_raw_imgs)
         batch_imgs = self.preprocessors["Normalize"](imgs=batch_imgs)
         batch_imgs = self.preprocessors["ToCHW"](imgs=batch_imgs)
@@ -111,7 +111,8 @@ class UadPredictor(BasicPredictor):
             batch_preds = np.split(batch_preds[0], len(batch_data), axis=0)

         return {
-            "input_path": batch_data,
+            "input_path": batch_data.input_paths,
+            "page_index": batch_data.page_indexes,
             "input_img": batch_raw_imgs,
             "pred": batch_preds,
         }

+ 0 - 2
paddlex/inference/models_new/common/static_infer.py

@@ -110,8 +110,6 @@ class StaticInfer:
             self._update_option(option)

     def _reset(self) -> None:
-        if not self.option:
-            self.option = PaddlePredictorOption()
         logging.debug(f"Env: {self.option}")
         (
             predictor,

+ 3 - 2
paddlex/inference/models_new/face_feature/predictor.py

@@ -45,7 +45,7 @@ class FaceFeaturePredictor(ImageFeaturePredictor):
         Returns:
             dict: A dictionary containing the input path, raw image, class IDs, scores, and label names for every instance of the batch. Keys include 'input_path', 'input_img', 'class_ids', 'scores', and 'label_names'.
         """
-        batch_raw_imgs = self.preprocessors["Read"](imgs=batch_data)
+        batch_raw_imgs = self.preprocessors["Read"](imgs=batch_data.instances)
         batch_imgs = self.preprocessors["Resize"](imgs=batch_raw_imgs)
         batch_imgs = self.preprocessors["Normalize"](imgs=batch_imgs)
         batch_imgs = self.preprocessors["ToCHW"](imgs=batch_imgs)
@@ -58,7 +58,8 @@ class FaceFeaturePredictor(ImageFeaturePredictor):
         features = self.postprocessors["NormalizeFeatures"](batch_preds)

         return {
-            "input_path": batch_data,
+            "input_path": batch_data.input_paths,
+            "page_index": batch_data.page_indexes,
             "input_img": batch_raw_imgs,
             "feature": features,
         }

+ 3 - 2
paddlex/inference/models_new/formula_recognition/predictor.py

@@ -85,7 +85,7 @@ class FormulaRecPredictor(BasicPredictor):
         return pre_tfs, infer, post_op

     def process(self, batch_data):
-        batch_raw_imgs = self.pre_tfs["Read"](imgs=batch_data)
+        batch_raw_imgs = self.pre_tfs["Read"](imgs=batch_data.instances)
         if self.model_name in ("LaTeX_OCR_rec"):
             batch_imgs = self.pre_tfs["MinMaxResize"](imgs=batch_raw_imgs)
             batch_imgs = self.pre_tfs["LatexTestTransform"](imgs=batch_imgs)
@@ -105,7 +105,8 @@ class FormulaRecPredictor(BasicPredictor):
         batch_preds = [p.reshape([-1]) for p in batch_preds[0]]
         rec_formula = self.post_op(batch_preds)
         return {
-            "input_path": batch_data,
+            "input_path": batch_data.input_paths,
+            "page_index": batch_data.page_indexes,
             "input_img": batch_raw_imgs,
             "rec_formula": rec_formula,
         }

+ 9 - 0
paddlex/inference/models_new/formula_recognition/result.py

@@ -33,6 +33,15 @@ from ....utils.file_interface import custom_open


 class FormulaRecResult(BaseCVResult):
+    def _get_input_fn(self):
+        fn = super()._get_input_fn()
+        if (page_idx := self["page_index"]) is not None:
+            fp = Path(fn)
+            stem, suffix = fp.stem, fp.suffix
+            return f"{stem}_{page_idx}{suffix}"
+        else:
+            return fn
+
     def _to_str(self, *args, **kwargs):
         data = copy.deepcopy(self)
         data.pop("input_img")

+ 3 - 2
paddlex/inference/models_new/image_classification/predictor.py

@@ -112,7 +112,7 @@ class ClasPredictor(BasicPredictor):
         Returns:
             dict: A dictionary containing the input path, raw image, class IDs, scores, and label names for every instance of the batch. Keys include 'input_path', 'input_img', 'class_ids', 'scores', and 'label_names'.
         """
-        batch_raw_imgs = self.preprocessors["Read"](imgs=batch_data)
+        batch_raw_imgs = self.preprocessors["Read"](imgs=batch_data.instances)
         batch_imgs = self.preprocessors["Resize"](imgs=batch_raw_imgs)
         if "Crop" in self.preprocessors:
             batch_imgs = self.preprocessors["Crop"](imgs=batch_imgs)
@@ -124,7 +124,8 @@ class ClasPredictor(BasicPredictor):
             batch_preds, topk=topk or self.topk
         )
         return {
-            "input_path": batch_data,
+            "input_path": batch_data.input_paths,
+            "page_index": batch_data.page_indexes,
             "input_img": batch_raw_imgs,
             "class_ids": batch_class_ids,
             "scores": batch_scores,

+ 1 - 0
paddlex/inference/models_new/image_classification/result.py

@@ -24,6 +24,7 @@ from ...common.result import BaseCVResult, StrMixin, JsonMixin


 class TopkResult(BaseCVResult):
+
     def _to_str(self, *args, **kwargs):
         data = copy.deepcopy(self)
         data.pop("input_img")

+ 3 - 2
paddlex/inference/models_new/image_feature/predictor.py

@@ -107,7 +107,7 @@ class ImageFeaturePredictor(BasicPredictor):
         Returns:
             dict: A dictionary containing the input path, raw image, class IDs, scores, and label names for every instance of the batch. Keys include 'input_path', 'input_img', 'class_ids', 'scores', and 'label_names'.
         """
-        batch_raw_imgs = self.preprocessors["Read"](imgs=batch_data)
+        batch_raw_imgs = self.preprocessors["Read"](imgs=batch_data.instances)
         batch_imgs = self.preprocessors["Resize"](imgs=batch_raw_imgs)
         batch_imgs = self.preprocessors["Normalize"](imgs=batch_imgs)
         batch_imgs = self.preprocessors["ToCHW"](imgs=batch_imgs)
@@ -115,7 +115,8 @@ class ImageFeaturePredictor(BasicPredictor):
         batch_preds = self.infer(x=x)
         features = self.postprocessors["NormalizeFeatures"](batch_preds)
         return {
-            "input_path": batch_data,
+            "input_path": batch_data.input_paths,
+            "page_index": batch_data.page_indexes,
             "input_img": batch_raw_imgs,
             "feature": features,
         }

+ 3 - 2
paddlex/inference/models_new/image_multilabel_classification/predictor.py

@@ -66,7 +66,7 @@ class MLClasPredictor(ClasPredictor):
         Returns:
             dict: A dictionary containing the input path, raw image, class IDs, scores, and label names for every instance of the batch. Keys include 'input_path', 'input_img', 'class_ids', 'scores', and 'label_names'.
         """
-        batch_raw_imgs = self.preprocessors["Read"](imgs=batch_data)
+        batch_raw_imgs = self.preprocessors["Read"](imgs=batch_data.instances)
         batch_imgs = self.preprocessors["Resize"](imgs=batch_raw_imgs)
         batch_imgs = self.preprocessors["Normalize"](imgs=batch_imgs)
         batch_imgs = self.preprocessors["ToCHW"](imgs=batch_imgs)
@@ -79,7 +79,8 @@ class MLClasPredictor(ClasPredictor):
             threshold=self.threshold if threshold is None else threshold,
         )
         return {
-            "input_path": batch_data,
+            "input_path": batch_data.input_paths,
+            "page_index": batch_data.page_indexes,
             "input_img": batch_raw_imgs,
             "class_ids": batch_class_ids,
             "scores": batch_scores,

+ 3 - 2
paddlex/inference/models_new/image_unwarping/predictor.py

@@ -90,7 +90,7 @@ class WarpPredictor(BasicPredictor):
         Returns:
             dict: A dictionary containing the input path, raw image, class IDs, scores, and label names for every instance of the batch. Keys include 'input_path', 'input_img', 'class_ids', 'scores', and 'label_names'.
         """
-        batch_raw_imgs = self.preprocessors["Read"](imgs=batch_data)
+        batch_raw_imgs = self.preprocessors["Read"](imgs=batch_data.instances)
         batch_imgs = self.preprocessors["Normalize"](imgs=batch_raw_imgs)
         batch_imgs = self.preprocessors["ToCHW"](imgs=batch_imgs)
         x = self.preprocessors["ToBatch"](imgs=batch_imgs)
@@ -98,7 +98,8 @@ class WarpPredictor(BasicPredictor):
         batch_warp_preds = self.postprocessors["DocTrPostProcess"](batch_preds)

         return {
-            "input_path": batch_data,
+            "input_path": batch_data.input_paths,
+            "page_index": batch_data.page_indexes,
             "input_img": batch_raw_imgs,
             "doctr_img": batch_warp_preds,
         }

+ 3 - 2
paddlex/inference/models_new/instance_segmentation/predictor.py

@@ -117,7 +117,7 @@ class InstanceSegPredictor(DetPredictor):
             dict: A dictionary containing the input path, raw image, box and mask
                 for every instance of the batch. Keys include 'input_path', 'input_img', 'boxes' and 'masks'.
         """
-        datas = batch_data
+        datas = batch_data.instances
         # preprocess
         for pre_op in self.pre_ops[:-1]:
             datas = pre_op(datas)
@@ -146,7 +146,8 @@ class InstanceSegPredictor(DetPredictor):
         )

         return {
-            "input_path": [data.get("img_path", None) for data in datas],
+            "input_path": batch_data.input_paths,
+            "page_index": batch_data.page_indexes,
             "input_img": [data["ori_img"] for data in datas],
             "boxes": [result["boxes"] for result in boxes_masks],
             "masks": [result["masks"] for result in boxes_masks],

+ 31 - 22
paddlex/inference/models_new/object_detection/predictor.py

@@ -83,20 +83,26 @@ class DetPredictor(BasicPredictor):
                 raise ValueError(
                     f"The type of `img_size` must be int or Tuple[int, int], but got {type(img_size)}."
                 )
-        
+
         if layout_unclip_ratio is not None:
             if isinstance(layout_unclip_ratio, float):
                 layout_unclip_ratio = (layout_unclip_ratio, layout_unclip_ratio)
             elif isinstance(layout_unclip_ratio, (tuple, list)):
-                assert len(layout_unclip_ratio) == 2, f"The length of `layout_unclip_ratio` should be 2."
+                assert (
+                    len(layout_unclip_ratio) == 2
+                ), f"The length of `layout_unclip_ratio` should be 2."
             else:
                 raise ValueError(
                     f"The type of `layout_unclip_ratio` must be float or Tuple[float, float], but got {type(layout_unclip_ratio)}."
                 )
-        
+
         if layout_merge_bboxes_mode is not None:
-            assert layout_merge_bboxes_mode in ["union", "large", "small"], \
-                f"The value of `layout_merge_bboxes_mode` must be one of ['union', 'large', 'small'], but got {layout_merge_bboxes_mode}"
+            assert layout_merge_bboxes_mode in [
+                "union",
+                "large",
+                "small",
+            ], f"The value of `layout_merge_bboxes_mode` must be one of ['union', 'large', 'small'], but got {layout_merge_bboxes_mode}"
+
         self.img_size = img_size
         self.threshold = threshold
         self.layout_nms = layout_nms
@@ -197,13 +203,14 @@ class DetPredictor(BasicPredictor):
         else:
             return [{"boxes": np.array(res)} for res in pred_box]

-    def process(self, 
-            batch_data: List[Any], 
-            threshold: Optional[Union[float, dict]] = None,
-            layout_nms: Optional[bool] = None,
-            layout_unclip_ratio: Optional[Union[float, Tuple[float, float]]] = None,
-            layout_merge_bboxes_mode: Optional[str] = None,
-        ):
+    def process(
+        self,
+        batch_data: List[Any],
+        threshold: Optional[Union[float, dict]] = None,
+        layout_nms: bool = False,
+        layout_unclip_ratio: Optional[Union[float, Tuple[float, float]]] = None,
+        layout_merge_bboxes_mode: Optional[str] = None,
+    ):
         """
         Process a batch of data through the preprocessing, inference, and postprocessing.

@@ -218,7 +225,7 @@ class DetPredictor(BasicPredictor):
             dict: A dictionary containing the input path, raw image, class IDs, scores, and label names
                 for every instance of the batch. Keys include 'input_path', 'input_img', 'class_ids', 'scores', and 'label_names'.
         """
-        datas = batch_data
+        datas = batch_data.instances
         # preprocess
         for pre_op in self.pre_ops[:-1]:
             datas = pre_op(datas)
@@ -233,16 +240,18 @@ class DetPredictor(BasicPredictor):
         preds_list = self._format_output(batch_preds)
         # postprocess
         boxes = self.post_op(
-            preds_list, 
-            datas, 
-            threshold = threshold or self.threshold,
+            preds_list,
+            datas,
+            threshold=threshold or self.threshold,
             layout_nms=layout_nms or self.layout_nms,
             layout_unclip_ratio=layout_unclip_ratio or self.layout_unclip_ratio,
-            layout_merge_bboxes_mode=layout_merge_bboxes_mode or self.layout_merge_bboxes_mode
+            layout_merge_bboxes_mode=layout_merge_bboxes_mode
+            or self.layout_merge_bboxes_mode,
         )

         return {
-            "input_path": [data.get("img_path", None) for data in datas],
+            "input_path": batch_data.input_paths,
+            "page_index": batch_data.page_indexes,
             "input_img": [data["ori_img"] for data in datas],
             "boxes": boxes,
         }
@@ -330,7 +339,7 @@ class DetPredictor(BasicPredictor):
         if self.layout_unclip_ratio is None:
             self.layout_unclip_ratio = self.config.get("layout_unclip_ratio", None)
         if self.layout_merge_bboxes_mode is None:
-            self.layout_merge_bboxes_mode = self.config.get("layout_merge_bboxes_mode", None)
-        return DetPostProcess(
-            labels=self.config["label_list"]
-        )
+            self.layout_merge_bboxes_mode = self.config.get(
+                "layout_merge_bboxes_mode", None
+            )
+        return DetPostProcess(labels=self.config["label_list"])

+ 3 - 2
paddlex/inference/models_new/semantic_segmentation/predictor.py

@@ -120,7 +120,7 @@ class SegPredictor(BasicPredictor):
         Returns:
             dict: A dictionary containing the input path, raw image, and predicted segmentation maps for every instance of the batch. Keys include 'input_path', 'input_img', and 'pred'.
         """
-        batch_raw_imgs = self.preprocessors["Read"](imgs=batch_data)
+        batch_raw_imgs = self.preprocessors["Read"](imgs=batch_data.instances)
         batch_imgs = self.preprocessors["Resize"](
             imgs=batch_raw_imgs, target_size=target_size
         )
@@ -135,7 +135,8 @@ class SegPredictor(BasicPredictor):
         batch_preds = self.postprocessers(batch_preds, batch_raw_imgs)

         return {
-            "input_path": batch_data,
+            "input_path": batch_data.input_paths,
+            "page_index": batch_data.page_indexes,
             "input_img": batch_raw_imgs,
             "pred": batch_preds,
         }

+ 3 - 2
paddlex/inference/models_new/table_structure_recognition/predictor.py

@@ -84,7 +84,7 @@ class TablePredictor(BasicPredictor):
         Returns:
             dict: A dictionary containing the input path, raw image, class IDs, scores, and label names for every instance of the batch. Keys include 'input_path', 'input_img', 'class_ids', 'scores', and 'label_names'.
         """
-        batch_raw_imgs = self.preprocessors[0](imgs=batch_data)  # ReadImage
+        batch_raw_imgs = self.preprocessors[0](imgs=batch_data.instances)  # ReadImage
         ori_shapes = []
         for s in range(len(batch_raw_imgs)):
             ori_shapes.append([batch_raw_imgs[s].shape[1], batch_raw_imgs[s].shape[0]])
@@ -116,7 +116,8 @@ class TablePredictor(BasicPredictor):
             table_result_structure_score.append(table_result[i]["structure_score"])

         final_result = {
-            "input_path": batch_data,
+            "input_path": batch_data.input_paths,
+            "page_index": batch_data.page_indexes,
             "input_img": batch_raw_imgs,
             "bbox": table_result_bbox,
             "structure": table_result_structure,

+ 8 - 2
paddlex/inference/models_new/table_structure_recognition/result.py

@@ -25,8 +25,14 @@ from ...common.result import BaseCVResult, StrMixin, JsonMixin
 class TableRecResult(BaseCVResult):
     """SaveTableResults"""

-    def __init__(self, data):
-        super().__init__(data)
+    def _get_input_fn(self):
+        fn = super()._get_input_fn()
+        if (page_idx := self["page_index"]) is not None:
+            fp = Path(fn)
+            stem, suffix = fp.stem, fp.suffix
+            return f"{stem}_{page_idx}{suffix}"
+        else:
+            return fn

     def _to_img(self):
         image = self["input_img"]

+ 3 - 2
paddlex/inference/models_new/text_detection/predictor.py

@@ -95,7 +95,7 @@ class TextDetPredictor(BasicPredictor):
         unclip_ratio: Union[float, None] = None,
     ):

-        batch_raw_imgs = self.pre_tfs["Read"](imgs=batch_data)
+        batch_raw_imgs = self.pre_tfs["Read"](imgs=batch_data.instances)
         batch_imgs, batch_shapes = self.pre_tfs["Resize"](
             imgs=batch_raw_imgs,
             limit_side_len=limit_side_len or self.limit_side_len,
@@ -113,7 +113,8 @@ class TextDetPredictor(BasicPredictor):
             unclip_ratio=unclip_ratio or self.unclip_ratio,
         )
         return {
-            "input_path": batch_data,
+            "input_path": batch_data.input_paths,
+            "page_index": batch_data.page_indexes,
             "input_img": batch_raw_imgs,
             "dt_polys": polys,
             "dt_scores": scores,

+ 9 - 2
paddlex/inference/models_new/text_detection/result.py

@@ -15,14 +15,21 @@
 import copy
 import numpy as np
 import cv2
+from pathlib import Path

 from ...common.result import BaseCVResult, StrMixin, JsonMixin


 class TextDetResult(BaseCVResult):

-    def __init__(self, data):
-        super().__init__(data)
+    def _get_input_fn(self):
+        fn = super()._get_input_fn()
+        if (page_idx := self["page_index"]) is not None:
+            fp = Path(fn)
+            stem, suffix = fp.stem, fp.suffix
+            return f"{stem}_{page_idx}{suffix}"
+        else:
+        return fn

     def _to_img(self):
         """draw rectangle"""

+ 3 - 2
paddlex/inference/models_new/text_recognition/predictor.py

@@ -67,13 +67,14 @@ class TextRecPredictor(BasicPredictor):
         return pre_tfs, infer, post_op

     def process(self, batch_data):
-        batch_raw_imgs = self.pre_tfs["Read"](imgs=batch_data)
+        batch_raw_imgs = self.pre_tfs["Read"](imgs=batch_data.instances)
         batch_imgs = self.pre_tfs["ReisizeNorm"](imgs=batch_raw_imgs)
         x = self.pre_tfs["ToBatch"](imgs=batch_imgs)
         batch_preds = self.infer(x=x)
         texts, scores = self.post_op(batch_preds)
         return {
-            "input_path": batch_data,
+            "input_path": batch_data.input_paths,
+            "page_index": batch_data.page_indexes,
             "input_img": batch_raw_imgs,
             "rec_text": texts,
             "rec_score": scores,

+ 4 - 2
paddlex/inference/pipelines_new/attribute_recognition/pipeline.py

@@ -63,9 +63,11 @@ class AttributeRecPipeline(BasePipeline):
         det_threshold = self.det_threshold if det_threshold is None else det_threshold
         cls_threshold = self.cls_threshold if cls_threshold is None else cls_threshold
         for img_id, batch_data in enumerate(self.batch_sampler(input)):
-            raw_imgs = self.img_reader(batch_data)
+            raw_imgs = self.img_reader(batch_data.instances)
             all_det_res = list(self.det_model(raw_imgs, threshold=det_threshold))
-            for input_data, raw_img, det_res in zip(batch_data, raw_imgs, all_det_res):
+            for input_data, raw_img, det_res in zip(
+                batch_data.instances, raw_imgs, all_det_res
+            ):
                 cls_res = self.get_cls_result(raw_img, det_res, cls_threshold)
                 yield self.get_final_result(input_data, raw_img, det_res, cls_res)


+ 3 - 8
paddlex/inference/pipelines_new/doc_preprocessor/pipeline.py

@@ -163,13 +163,7 @@ class DocPreprocessorPipeline(BasePipeline):
             yield {"error": "the input params for model settings are invalid!"}

         for img_id, batch_data in enumerate(self.batch_sampler(input)):
-            if not isinstance(batch_data[0], str):
-                # TODO: add support input_pth for ndarray and pdf
-                input_path = f"{img_id}.jpg"
-            else:
-                input_path = batch_data[0]
-
-            image_array = self.img_reader(batch_data)[0]
+            image_array = self.img_reader(batch_data.instances)[0]

             if model_settings["use_doc_orientation_classify"]:
                 pred = next(self.doc_ori_classify_model(image_array))
@@ -185,7 +179,8 @@ class DocPreprocessorPipeline(BasePipeline):
                 output_img = rot_img

             single_img_res = {
-                "input_path": input_path,
+                "input_path": batch_data.input_paths[0],
+                "page_index": batch_data.page_indexes[0],
                 "input_img": image_array,
                 "model_settings": model_settings,
                 "angle": angle,

+ 3 - 8
paddlex/inference/pipelines_new/formula_recognition/pipeline.py

@@ -234,13 +234,7 @@ class FormulaRecognitionPipeline(BasePipeline):
             yield {"error": "the input params for model settings are invalid!"}

         for img_id, batch_data in enumerate(self.batch_sampler(input)):
-            if not isinstance(batch_data[0], str):
-                # TODO: add support input_pth for ndarray and pdf
-                input_path = f"{img_id}.jpg"
-            else:
-                input_path = batch_data[0]
-
-            image_array = self.img_reader(batch_data)[0]
+            image_array = self.img_reader(batch_data.instances)[0]

             if model_settings["use_doc_preprocessor"]:
                 doc_preprocessor_res = next(
@@ -301,7 +295,8 @@ class FormulaRecognitionPipeline(BasePipeline):
                     formula_res_list[idx] = formula_rec_res

             single_img_res = {
-                "input_path": input_path,
+                "input_path": batch_data.input_paths[0],
+                "page_index": batch_data.page_indexes[0],
                 "layout_det_res": layout_det_res,
                 "doc_preprocessor_res": doc_preprocessor_res,
                 "formula_res_list": formula_res_list,

+ 3 - 8
paddlex/inference/pipelines_new/layout_parsing/pipeline.py

@@ -293,13 +293,7 @@ class LayoutParsingPipeline(BasePipeline):
             yield {"error": "the input params for model settings are invalid!"}

         for img_id, batch_data in enumerate(self.batch_sampler(input)):
-            if not isinstance(batch_data[0], str):
-                # TODO: add support input_pth for ndarray and pdf
-                input_path = f"{img_id}.jpg"
-            else:
-                input_path = batch_data[0]
-
-            image_array = self.img_reader(batch_data)[0]
+            image_array = self.img_reader(batch_data.instances)[0]

             if model_settings["use_doc_preprocessor"]:
                 doc_preprocessor_res = next(
@@ -393,7 +387,8 @@ class LayoutParsingPipeline(BasePipeline):
                 formula_res_list = []

             single_img_res = {
-                "input_path": input_path,
+                "input_path": batch_data.input_paths[0],
+                "page_index": batch_data.page_indexes[0],
                 "doc_preprocessor_res": doc_preprocessor_res,
                 "layout_det_res": layout_det_res,
                 "overall_ocr_res": overall_ocr_res,

+ 3 - 8
paddlex/inference/pipelines_new/layout_parsing/pipeline_v2.py

@@ -309,13 +309,7 @@ class LayoutParsingPipelineV2(BasePipeline):
             yield {"error": "the input params for model settings are invalid!"}

         for img_id, batch_data in enumerate(self.batch_sampler(input)):
-            if not isinstance(batch_data[0], str):
-                # TODO: add support input_pth for ndarray and pdf
-                input_path = f"{img_id}"
-            else:
-                input_path = batch_data[0]
-
-            image_array = self.img_reader(batch_data)[0]
+            image_array = self.img_reader(batch_data.instances)[0]

             if model_settings["use_doc_preprocessor"]:
                 doc_preprocessor_res = next(
@@ -452,7 +446,8 @@ class LayoutParsingPipelineV2(BasePipeline):
             ]

             single_img_res = {
-                "input_path": input_path,
+                "input_path": batch_data.input_paths[0],
+                "page_index": batch_data.page_indexes[0],
                 "doc_preprocessor_res": doc_preprocessor_res,
                 "layout_det_res": layout_det_res,
                 "overall_ocr_res": overall_ocr_res,

+ 3 - 8
paddlex/inference/pipelines_new/ocr/pipeline.py

@@ -304,13 +304,7 @@ class OCRPipeline(BasePipeline):
             text_rec_score_thresh = self.text_rec_score_thresh

         for img_id, batch_data in enumerate(self.batch_sampler(input)):
-            if not isinstance(batch_data[0], str):
-                # TODO: add support input_pth for ndarray and pdf
-                input_path = f"{img_id}.jpg"
-            else:
-                input_path = batch_data[0]
-
-            image_array = self.img_reader(batch_data)[0]
+            image_array = self.img_reader(batch_data.instances)[0]

             if model_settings["use_doc_preprocessor"]:
                 doc_preprocessor_res = next(
@@ -335,7 +329,8 @@ class OCRPipeline(BasePipeline):
             dt_polys = self._sort_boxes(dt_polys)

             single_img_res = {
-                "input_path": input_path,
+                "input_path": batch_data.input_paths[0],
+                "page_index": batch_data.page_indexes[0],
                 "doc_preprocessor_res": doc_preprocessor_res,
                 "dt_polys": dt_polys,
                 "model_settings": model_settings,

+ 15 - 2
paddlex/inference/pipelines_new/ocr/result.py

@@ -29,6 +29,15 @@ from ...common.result import BaseCVResult, StrMixin, JsonMixin
 class OCRResult(BaseCVResult):
     """OCR result"""

+    def _get_input_fn(self):
+        fn = super()._get_input_fn()
+        if (page_idx := self["page_index"]) is not None:
+            fp = Path(fn)
+            stem, suffix = fp.stem, fp.suffix
+            return f"{stem}_{page_idx}{suffix}"
+        else:
+            return fn
+
     def get_minarea_rect(self, points: np.ndarray) -> np.ndarray:
         """
         Get the minimum area rectangle for the given points using OpenCV.
@@ -127,13 +136,15 @@
         """
         data = {}
         data["input_path"] = self["input_path"]
+        data["page_index"] = self["page_index"]
         data["model_settings"] = self["model_settings"]
         if self["model_settings"]["use_doc_preprocessor"]:
             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].str["res"]
         data["dt_polys"] = self["dt_polys"]
         data["text_det_params"] = self["text_det_params"]
         data["text_type"] = self["text_type"]
-        data["textline_orientation_angles"] = self["textline_orientation_angles"]
+        if "textline_orientation_angles" in self:
+            data["textline_orientation_angles"] = self["textline_orientation_angles"]
         data["text_rec_score_thresh"] = self["text_rec_score_thresh"]
         data["rec_texts"] = self["rec_texts"]
         data["rec_scores"] = self["rec_scores"]
@@ -155,13 +166,15 @@
         """
         data = {}
         data["input_path"] = self["input_path"]
+        data["page_index"] = self["page_index"]
         data["model_settings"] = self["model_settings"]
         if self["model_settings"]["use_doc_preprocessor"]:
             data["doc_preprocessor_res"] = self["doc_preprocessor_res"].json["res"]
         data["dt_polys"] = self["dt_polys"]
         data["text_det_params"] = self["text_det_params"]
         data["text_type"] = self["text_type"]
-        data["textline_orientation_angles"] = self["textline_orientation_angles"]
+        if "textline_orientation_angles" in self:
+            data["textline_orientation_angles"] = self["textline_orientation_angles"]
         data["text_rec_score_thresh"] = self["text_rec_score_thresh"]
         data["text_rec_score_thresh"] = self["text_rec_score_thresh"]
         data["rec_texts"] = self["rec_texts"]
         data["rec_texts"] = self["rec_texts"]
         data["rec_scores"] = self["rec_scores"]
         data["rec_scores"] = self["rec_scores"]

+ 24 - 18
paddlex/inference/pipelines_new/seal_recognition/pipeline.py

@@ -75,11 +75,21 @@ class SealRecognitionPipeline(BasePipeline):
                 layout_kwargs["threshold"] = threshold
                 layout_kwargs["threshold"] = threshold
             if (layout_nms := layout_det_config.get("layout_nms", None)) is not None:
             if (layout_nms := layout_det_config.get("layout_nms", None)) is not None:
                 layout_kwargs["layout_nms"] = layout_nms
                 layout_kwargs["layout_nms"] = layout_nms
-            if (layout_unclip_ratio := layout_det_config.get("layout_unclip_ratio", None)) is not None:
+            if (
+                layout_unclip_ratio := layout_det_config.get(
+                    "layout_unclip_ratio", None
+                )
+            ) is not None:
                 layout_kwargs["layout_unclip_ratio"] = layout_unclip_ratio
                 layout_kwargs["layout_unclip_ratio"] = layout_unclip_ratio
-            if (layout_merge_bboxes_mode := layout_det_config.get("layout_merge_bboxes_mode", None)) is not None:
+            if (
+                layout_merge_bboxes_mode := layout_det_config.get(
+                    "layout_merge_bboxes_mode", None
+                )
+            ) is not None:
                 layout_kwargs["layout_merge_bboxes_mode"] = layout_merge_bboxes_mode
                 layout_kwargs["layout_merge_bboxes_mode"] = layout_merge_bboxes_mode
-            self.layout_det_model = self.create_model(layout_det_config, **layout_kwargs)
+            self.layout_det_model = self.create_model(
+                layout_det_config, **layout_kwargs
+            )
         seal_ocr_config = config.get("SubPipelines", {}).get(
             "SealOCR", {"pipeline_config_error": "config error for seal_ocr_pipeline!"}
         )
@@ -185,13 +195,7 @@ class SealRecognitionPipeline(BasePipeline):
             yield {"error": "the input params for model settings are invalid!"}
             yield {"error": "the input params for model settings are invalid!"}
 
 
         for img_id, batch_data in enumerate(self.batch_sampler(input)):
         for img_id, batch_data in enumerate(self.batch_sampler(input)):
-            if not isinstance(batch_data[0], str):
-                # TODO: add support input_pth for ndarray and pdf
-                input_path = f"{img_id}.jpg"
-            else:
-                input_path = batch_data[0]
-
-            image_array = self.img_reader(batch_data)[0]
+            image_array = self.img_reader(batch_data.instances)[0]
 
             if model_settings["use_doc_preprocessor"]:
                 doc_preprocessor_res = next(
@@ -226,14 +230,15 @@ class SealRecognitionPipeline(BasePipeline):
                 seal_region_id += 1
             else:
                 if model_settings["use_layout_detection"]:
-                    layout_det_res = next(self.layout_det_model(
-                        doc_preprocessor_image,
-                        threshold=layout_threshold,
-                        layout_nms=layout_nms,
-                        layout_unclip_ratio=layout_unclip_ratio,
-                        layout_merge_bboxes_mode=layout_merge_bboxes_mode
+                    layout_det_res = next(
+                        self.layout_det_model(
+                            doc_preprocessor_image,
+                            threshold=layout_threshold,
+                            layout_nms=layout_nms,
+                            layout_unclip_ratio=layout_unclip_ratio,
+                            layout_merge_bboxes_mode=layout_merge_bboxes_mode,
+                        )
                     )
-                )
 
                 for box_info in layout_det_res["boxes"]:
                     if box_info["label"].lower() in ["seal"]:
@@ -257,7 +262,8 @@ class SealRecognitionPipeline(BasePipeline):
                         seal_region_id += 1
 
             single_img_res = {
-                "input_path": input_path,
+                "input_path": batch_data.input_paths[0],
+                "page_index": batch_data.page_indexes[0],
                 "doc_preprocessor_res": doc_preprocessor_res,
                 "doc_preprocessor_res": doc_preprocessor_res,
                 "layout_det_res": layout_det_res,
                 "layout_det_res": layout_det_res,
                 "seal_res_list": seal_res_list,
                 "seal_res_list": seal_res_list,

+ 3 - 8
paddlex/inference/pipelines_new/table_recognition/pipeline.py

@@ -300,13 +300,7 @@ class TableRecognitionPipeline(BasePipeline):
             yield {"error": "the input params for model settings are invalid!"}
             yield {"error": "the input params for model settings are invalid!"}
 
 
         for img_id, batch_data in enumerate(self.batch_sampler(input)):
         for img_id, batch_data in enumerate(self.batch_sampler(input)):
-            if not isinstance(batch_data[0], str):
-                # TODO: add support input_pth for ndarray and pdf
-                input_path = f"{img_id}"
-            else:
-                input_path = batch_data[0]
-
-            image_array = self.img_reader(batch_data)[0]
+            image_array = self.img_reader(batch_data.instances)[0]
 
             if model_settings["use_doc_preprocessor"]:
                 doc_preprocessor_res = next(
@@ -368,7 +362,8 @@ class TableRecognitionPipeline(BasePipeline):
                         table_region_id += 1
 
             single_img_res = {
-                "input_path": input_path,
+                "input_path": batch_data.input_paths[0],
+                "page_index": batch_data.page_indexes[0],
                 "doc_preprocessor_res": doc_preprocessor_res,
                 "doc_preprocessor_res": doc_preprocessor_res,
                 "layout_det_res": layout_det_res,
                 "layout_det_res": layout_det_res,
                 "overall_ocr_res": overall_ocr_res,
                 "overall_ocr_res": overall_ocr_res,

+ 3 - 8
paddlex/inference/pipelines_new/table_recognition/pipeline_v2.py

@@ -368,13 +368,7 @@ class TableRecognitionPipelineV2(BasePipeline):
             yield {"error": "the input params for model settings are invalid!"}
             yield {"error": "the input params for model settings are invalid!"}
 
 
         for img_id, batch_data in enumerate(self.batch_sampler(input)):
         for img_id, batch_data in enumerate(self.batch_sampler(input)):
-            if not isinstance(batch_data[0], str):
-                # TODO: add support input_pth for ndarray and pdf
-                input_path = f"{img_id}"
-            else:
-                input_path = batch_data[0]
-
-            image_array = self.img_reader(batch_data)[0]
+            image_array = self.img_reader(batch_data.instances)[0]
 
             if model_settings["use_doc_preprocessor"]:
                 doc_preprocessor_res = next(
@@ -436,7 +430,8 @@ class TableRecognitionPipelineV2(BasePipeline):
                         table_region_id += 1
 
             single_img_res = {
-                "input_path": input_path,
+                "input_path": batch_data.input_paths[0],
+                "page_index": batch_data.page_indexes[0],
                 "doc_preprocessor_res": doc_preprocessor_res,
                 "doc_preprocessor_res": doc_preprocessor_res,
                 "layout_det_res": layout_det_res,
                 "layout_det_res": layout_det_res,
                 "overall_ocr_res": overall_ocr_res,
                 "overall_ocr_res": overall_ocr_res,

+ 15 - 2
paddlex/inference/pipelines_new/table_recognition/result.py

@@ -30,6 +30,15 @@ class SingleTableRecognitionResult(BaseCVResult, HtmlMixin, XlsxMixin):
         HtmlMixin.__init__(self)
         XlsxMixin.__init__(self)
 
+    def _get_input_fn(self):
+        fn = super()._get_input_fn()
+        if (page_idx := self["page_index"]) is not None:
+            fp = Path(fn)
+            stem, suffix = fp.stem, fp.suffix
+            return f"{stem}_{page_idx}{suffix}"
+        else:
+            return fn
+
     def _to_html(self) -> Dict[str, str]:
         """Converts the prediction to its corresponding HTML representation.
 
@@ -101,7 +110,9 @@ class TableRecognitionResult(BaseCVResult, HtmlMixin, XlsxMixin):
         res_img_dict.update(**self["overall_ocr_res"].img)
 
         if len(self["table_res_list"]) > 0:
-            table_cell_img = Image.fromarray(copy.deepcopy(self["doc_preprocessor_res"]["output_img"]))
+            table_cell_img = Image.fromarray(
+                copy.deepcopy(self["doc_preprocessor_res"]["output_img"])
+            )
             table_draw = ImageDraw.Draw(table_cell_img)
             rectangle_color = (255, 0, 0)
             for sno in range(len(self["table_res_list"])):
@@ -109,7 +120,9 @@ class TableRecognitionResult(BaseCVResult, HtmlMixin, XlsxMixin):
                 cell_box_list = table_res["cell_box_list"]
                 for box in cell_box_list:
                     x1, y1, x2, y2 = [int(pos) for pos in box]
-                    table_draw.rectangle([x1, y1, x2, y2], outline=rectangle_color, width=2)
+                    table_draw.rectangle(
+                        [x1, y1, x2, y2], outline=rectangle_color, width=2
+                    )
             res_img_dict["table_cell_img"] = table_cell_img
             res_img_dict["table_cell_img"] = table_cell_img
         return res_img_dict
         return res_img_dict