liuhongen1234567 10 months ago
parent
commit
7dc9ba6389

+ 30 - 0
api_examples/pipelines/test_video_classification.py

@@ -0,0 +1,30 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddlex import create_pipeline
+
+pipeline = create_pipeline(pipeline="video_classification")
+
+output = pipeline.predict("./test_samples/general_video_classification_001.mp4", topk=8)
+# input_list = ["./test_samples/general_video_classification_001.mp4",
+#                 "./test_samples/__lt03EF4ao.mkv",
+#                 "./test_samples/__u6odV5hrI.mp4",
+#                 "./test_samples/_8c2EG7JDOw.mp4.webm",]
+# output = pipeline.predict(input_list, topk=5)
+
+for res in output:
+    print(res)
+    res.print()  ## print the structured prediction output
+    res.save_to_video("./output/")  ## save the visualized result video
+    res.save_to_json("./output/")  ## save the structured prediction output as JSON
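
For reference, a minimal sketch of consuming the per-video results programmatically. It assumes each `res` is dict-like and exposes the `input_path`, `class_ids`, `scores`, and (optionally) `label_names` keys used in `result.py` later in this diff; the fallback to `class_ids` mirrors `TopkVideoResult._to_video`.

```python
# Minimal sketch: collect the top-k predictions from each result into plain
# Python dicts. Assumes each `res` supports dict-style access to the keys
# used in result.py below ("input_path", "class_ids", "scores", "label_names").
from paddlex import create_pipeline

pipeline = create_pipeline(pipeline="video_classification")

summaries = []
for res in pipeline.predict("./test_samples/general_video_classification_001.mp4", topk=5):
    labels = res.get("label_names", res["class_ids"])  # fall back to raw class ids
    summaries.append(
        {
            "video": res["input_path"],
            "predictions": list(zip(labels, res["scores"])),  # (label, score) pairs
        }
    )

print(summaries)
```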

+ 9 - 0
paddlex/configs/pipelines/video_classification.yaml

@@ -0,0 +1,9 @@
+pipeline_name: video_classification
+
+SubModules:
+  VideoClassification:
+    module_name: video_classification
+    model_name: PP-TSMv2-LCNetV2_8frames_uniform
+    model_dir: null
+    batch_size: 1
+    topk: 1
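
A small sketch of how this config maps onto what the pipeline reads (see `pipeline.py` later in this diff, which accesses `config["SubModules"]["VideoClassification"]`). It assumes PyYAML is available and that the file is read from a local PaddleX checkout.

```python
# Minimal sketch: load the pipeline config and read the sub-module entry the
# pipeline consumes. Assumes PyYAML is installed and a local checkout path.
import yaml

with open("paddlex/configs/pipelines/video_classification.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

model_cfg = config["SubModules"]["VideoClassification"]
print(model_cfg["model_name"])                      # PP-TSMv2-LCNetV2_8frames_uniform
print(model_cfg["batch_size"], model_cfg["topk"])   # 1 1
```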

+ 0 - 3
paddlex/inference/common/result/base_video_result.py

@@ -14,7 +14,6 @@
 
 from .base_result import BaseResult
 from .mixin import StrMixin, JsonMixin, ImgMixin, VideoMixin
-from ...utils.io import VideoReader, VideoWriter
 
 
 class BaseVideoResult(BaseResult, StrMixin, JsonMixin, VideoMixin):
@@ -32,8 +31,6 @@ class BaseVideoResult(BaseResult, StrMixin, JsonMixin, VideoMixin):
         Raises:
             AssertionError: If the required key (`BaseVideoResult.INPUT_IMG_KEY`) is not found in the data.
         """
-        self._video_reader = VideoReader(backend="decord")
-        self._video_writer = VideoWriter(backend="opencv")
 
         super().__init__(data)
         StrMixin.__init__(self)

+ 4 - 5
paddlex/inference/common/result/mixin.py

@@ -456,7 +456,7 @@ class XlsxMixin:
 
 class VideoMixin:
     def __init__(self, backend="opencv", *args, **kwargs):
-        self._video_writer = VideoWriter(backend=backend, *args, **kwargs)
+        self._backend = backend
         self._save_funcs.append(self.save_to_video)
 
     @abstractmethod
@@ -469,9 +469,8 @@ class VideoMixin:
         return video
 
     def save_to_video(self, save_path, *args, **kwargs):
-        if not str(save_path).lower().endswith((".mp4", ".avi", ".mkv")):
+        video_writer = VideoWriter(backend=self._backend, *args, **kwargs)
+        if not str(save_path).lower().endswith((".mp4", ".avi", ".mkv", ".webm")):
             fp = Path(self["input_path"])
             save_path = Path(save_path) / f"{fp.stem}{fp.suffix}"
-        _save_list_data(
-            self._video_writer.write, save_path, self.video, *args, **kwargs
-        )
+        _save_list_data(video_writer.write, save_path, self.video, *args, **kwargs)
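
The writer is now constructed per call rather than held on the result object, and `.webm` joins the recognized extensions. A standalone sketch of the save-path resolution used above (the helper name is hypothetical, for illustration only): if `save_path` does not end with a known video extension, it is treated as a directory and the input file name is appended.

```python
# Minimal sketch of the save-path handling in save_to_video above.
from pathlib import Path

def resolve_video_save_path(save_path: str, input_path: str) -> Path:
    # No recognized video suffix -> treat save_path as a directory.
    if not str(save_path).lower().endswith((".mp4", ".avi", ".mkv", ".webm")):
        fp = Path(input_path)
        return Path(save_path) / f"{fp.stem}{fp.suffix}"
    return Path(save_path)

print(resolve_video_save_path("./output/", "videos/clip_001.mp4"))          # output/clip_001.mp4
print(resolve_video_save_path("./output/clip.avi", "videos/clip_001.mp4"))  # output/clip.avi
```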

+ 4 - 4
paddlex/inference/models_new/video_classification/processors.py

@@ -347,9 +347,9 @@ class VideoClasTopk:
         Returns:
             np.ndarray: The softmax-transformed data.
         """
-        exp_data = np.exp(data - np.max(data))
-        softmax_data = exp_data / np.sum(exp_data)
-        return softmax_data
+        x_max = np.max(data, axis=-1, keepdims=True)
+        e_x = np.exp(data - x_max)
+        return e_x / np.sum(e_x, axis=-1, keepdims=True)
 
     def _parse_class_id_map(
         self, class_ids: Optional[Sequence[Union[str, int]]]
@@ -384,7 +384,7 @@ class VideoClasTopk:
                 - A list of arrays of scores for the top-k predictions.
                 - A list of lists of label names for the top-k predictions.
         """
-        preds = self.softmax(preds)
+        preds[0] = self.softmax(preds[0])
         indexes = preds[0].argsort(axis=1)[:, -topk:][:, ::-1].astype("int32")
         scores = [
             np.around(pred[index], decimals=5) for pred, index in zip(preds[0], indexes)
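
A small worked example of the per-row (`axis=-1`) softmax and the argsort-based top-k selection, mirroring the updated code on a 2 x 4 batch of raw logits:

```python
# Worked example: batched softmax plus top-k selection (two videos, four classes).
import numpy as np

preds = np.array([[2.0, 1.0, 0.1, -1.0],
                  [0.0, 3.0, 1.0, 0.5]])

x_max = np.max(preds, axis=-1, keepdims=True)      # per-row max for numerical stability
e_x = np.exp(preds - x_max)
probs = e_x / np.sum(e_x, axis=-1, keepdims=True)  # each row now sums to 1

topk = 2
indexes = probs.argsort(axis=1)[:, -topk:][:, ::-1].astype("int32")
scores = [np.around(p[idx], decimals=5) for p, idx in zip(probs, indexes)]

print(indexes)  # [[0 1]
                #  [1 2]]
print(scores)   # per-video top-2 probabilities, highest first
```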

+ 2 - 1
paddlex/inference/models_new/video_classification/result.py

@@ -19,6 +19,7 @@ from PIL import Image, ImageDraw, ImageFont
 
 from ....utils.fonts import PINGFANG_FONT_FILE_PATH
 from ...utils.color_map import get_colormap
+from ...utils.io import VideoReader
 from ...common.result import BaseVideoResult
 
 
@@ -28,7 +29,7 @@ class TopkVideoResult(BaseVideoResult):
         """Draw label on image"""
         labels = self.get("label_names", self["class_ids"])
         label_str = f"{labels[0]} {self['scores'][0]:.2f}"
-        video_reader = self._video_reader
+        video_reader = VideoReader(backend="decord")
         video = video_reader.read(self["input_path"])
         video = list(video)
         write_fps = video_reader.get_fps()

+ 1 - 0
paddlex/inference/pipelines_new/__init__.py

@@ -25,6 +25,7 @@ from .pp_chatocr import PP_ChatOCRv3_Pipeline, PP_ChatOCRv4_Pipeline
 from .image_classification import ImageClassificationPipeline
 from .seal_recognition import SealRecognitionPipeline
 from .table_recognition import TableRecognitionPipeline
+from .video_classification import VideoClassificationPipeline
 
 
 def get_pipeline_path(pipeline_name: str) -> str:

+ 15 - 0
paddlex/inference/pipelines_new/video_classification/__init__.py

@@ -0,0 +1,15 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .pipeline import VideoClassificationPipeline

+ 73 - 0
paddlex/inference/pipelines_new/video_classification/pipeline.py

@@ -0,0 +1,73 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, Iterator, List, Optional, Union
+import numpy as np
+from ...utils.pp_option import PaddlePredictorOption
+from ..base import BasePipeline
+
+# [TODO] Update models_new to models
+from ...models_new.video_classification.result import TopkVideoResult
+
+
+class VideoClassificationPipeline(BasePipeline):
+    """Video Classification Pipeline"""
+
+    entities = "video_classification"
+
+    def __init__(
+        self,
+        config: Dict,
+        device: str = None,
+        pp_option: PaddlePredictorOption = None,
+        use_hpip: bool = False,
+        hpi_params: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """
+        Initializes the class with given configurations and options.
+
+        Args:
+            config (Dict): Configuration dictionary containing model and other parameters.
+            device (str): The device to run the prediction on. Default is None.
+            pp_option (PaddlePredictorOption): Options for PaddlePaddle predictor. Default is None.
+            use_hpip (bool): Whether to use high-performance inference (hpip) for prediction. Defaults to False.
+            hpi_params (Optional[Dict[str, Any]]): HPIP specific parameters. Default is None.
+        """
+        super().__init__(
+            device=device, pp_option=pp_option, use_hpip=use_hpip, hpi_params=hpi_params
+        )
+
+        video_classification_model_config = config["SubModules"]["VideoClassification"]
+        self.video_classification_model = self.create_model(
+            video_classification_model_config
+        )
+
+    def predict(
+        self,
+        input: Union[str, List[str], np.ndarray, List[np.ndarray]],
+        topk: Union[int, None] = 1,
+        **kwargs
+    ) -> Iterator[TopkVideoResult]:
+        """Predicts video classification results for the given input.
+
+        Args:
+            input (Union[str, List[str], np.ndarray, List[np.ndarray]]): The input video(s) or path(s) to the videos.
+            topk (Union[int, None]): The number of top predictions to return. Defaults to 1.
+            **kwargs: Additional keyword arguments that can be passed to the function.
+
+        Yields:
+            TopkVideoResult: The predicted top-k result for each input video.
+        """
+
+        yield from self.video_classification_model(input, topk=topk)
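
For completeness, a hedged usage sketch: constructing the pipeline directly from a config dict that mirrors `video_classification.yaml`, then iterating the generator returned by `predict()` (the method uses `yield from`, so results must be consumed lazily). This assumes `BasePipeline.create_model` accepts the sub-module dict as-is and that `"gpu:0"` is a valid device string; in normal use, `create_pipeline(pipeline="video_classification")` as in the test script above is the intended entry point.

```python
# Minimal sketch: direct instantiation from a config dict matching the YAML.
# The internal import path and device string are assumptions for illustration.
from paddlex.inference.pipelines_new.video_classification import (
    VideoClassificationPipeline,
)

config = {
    "pipeline_name": "video_classification",
    "SubModules": {
        "VideoClassification": {
            "module_name": "video_classification",
            "model_name": "PP-TSMv2-LCNetV2_8frames_uniform",
            "model_dir": None,
            "batch_size": 1,
            "topk": 1,
        }
    },
}

pipeline = VideoClassificationPipeline(config=config, device="gpu:0")

# predict() is a generator; iterate to run inference and save results.
for res in pipeline.predict("./test_samples/general_video_classification_001.mp4", topk=5):
    res.save_to_json("./output/")
```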