Zhang Zelun 10 bulan lalu
induk
melakukan
1830873160
20 mengubah file dengan 1922 tambahan dan 0 penghapusan
  1. 2 0
      paddlex/inference/models_new/__init__.py
  2. 1 0
      paddlex/inference/models_new/common/tokenizer/__init__.py
  3. 629 0
      paddlex/inference/models_new/common/tokenizer/bert_tokenizer.py
  4. 77 0
      paddlex/inference/models_new/common/tokenizer/tokenizer_utils.py
  5. 15 0
      paddlex/inference/models_new/open_vocabulary_detection/__init__.py
  6. 152 0
      paddlex/inference/models_new/open_vocabulary_detection/predictor.py
  7. 15 0
      paddlex/inference/models_new/open_vocabulary_detection/processors/__init__.py
  8. 425 0
      paddlex/inference/models_new/open_vocabulary_detection/processors/groundingdino_processors.py
  9. 15 0
      paddlex/inference/models_new/open_vocabulary_segmentation/__init__.py
  10. 124 0
      paddlex/inference/models_new/open_vocabulary_segmentation/predictor.py
  11. 15 0
      paddlex/inference/models_new/open_vocabulary_segmentation/processors/__init__.py
  12. 232 0
      paddlex/inference/models_new/open_vocabulary_segmentation/processors/sam_processer.py
  13. 15 0
      paddlex/inference/models_new/open_vocabulary_segmentation/results/__init__.py
  14. 138 0
      paddlex/inference/models_new/open_vocabulary_segmentation/results/sam_result.py
  15. 1 0
      paddlex/inference/utils/new_ir_blacklist.py
  16. 3 0
      paddlex/inference/utils/official_models.py
  17. 13 0
      paddlex/modules/open_vocabulary_detection/__init__.py
  18. 18 0
      paddlex/modules/open_vocabulary_detection/model_list.py
  19. 13 0
      paddlex/modules/open_vocabulary_segmentation/__init__.py
  20. 19 0
      paddlex/modules/open_vocabulary_segmentation/model_list.py

+ 2 - 0
paddlex/inference/models_new/__init__.py

@@ -35,6 +35,8 @@ from .ts_anomaly_detection import TSAdPredictor
 from .ts_classification import TSClsPredictor
 from .image_unwarping import WarpPredictor
 from .image_multilabel_classification import MLClasPredictor
+from .open_vocabulary_detection import OVDetPredictor
+from .open_vocabulary_segmentation import OVSegPredictor
 
 
 # from .table_recognition import TablePredictor

+ 1 - 0
paddlex/inference/models_new/common/tokenizer/__init__.py

@@ -14,3 +14,4 @@
 
 from .tokenizer_utils import PretrainedTokenizer
 from .gpt_tokenizer import GPTTokenizer
+from .bert_tokenizer import BertTokenizer

+ 629 - 0
paddlex/inference/models_new/common/tokenizer/bert_tokenizer.py

@@ -0,0 +1,629 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unicodedata
+
+from .tokenizer_utils import (
+    PretrainedTokenizer,
+    _is_control,
+    _is_punctuation,
+    _is_symbol,
+    _is_whitespace,
+    convert_to_unicode,
+    whitespace_tokenize,
+)
+
+__all__ = [
+    "BasicTokenizer",
+    "BertTokenizer",
+    "WordpieceTokenizer",
+]
+
+
+class BasicTokenizer(object):
+    """
+    First-stage tokenizer: cleans the text, optionally isolates CJK characters,
+    optionally lower-cases and strips accents, then splits on punctuation and symbols.
+
+    Args:
+        do_lower_case (bool):
+            Lower-case every token that is not protected by `never_split`.
+            Defaults to `True`.
+        never_split (Iterable):
+            Tokens that are passed through untouched (no lower casing, accent
+            stripping or punctuation splitting). Defaults to `None` (no tokens).
+        tokenize_chinese_chars (bool):
+            Surround each CJK ideograph with spaces so it becomes its own token.
+            Defaults to `True`.
+        strip_accents (bool):
+            Remove combining accent marks. When left as `None`, accents are
+            stripped exactly when `do_lower_case` is truthy. Defaults to `None`.
+    """
+
+    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
+        """Stores the tokenization options; `never_split` is kept as a set."""
+        self.do_lower_case = do_lower_case
+        self.never_split = set([] if never_split is None else never_split)
+        self.tokenize_chinese_chars = tokenize_chinese_chars
+        self.strip_accents = strip_accents
+
+    def tokenize(self, text, never_split=None):
+        """
+        Splits a piece of text into basic (pre-WordPiece) tokens.
+
+        Args:
+            text (str|bytes): The text to tokenize; bytes are decoded as utf-8.
+            never_split (List[str]): Extra tokens to protect for this call, in
+                addition to the ones given to the constructor.
+
+        Returns:
+            list(str): The resulting tokens.
+
+        Examples:
+            .. code-block::
+
+                basictokenizer = BasicTokenizer()
+                tokens = basictokenizer.tokenize('He was a puppeteer')
+                '''
+                ['he', 'was', 'a', 'puppeteer']
+                '''
+        """
+        unicode_text = convert_to_unicode(text)
+        if never_split:
+            protected = self.never_split.union(set(never_split))
+        else:
+            protected = self.never_split
+
+        cleaned = self._clean_text(unicode_text)
+        if self.tokenize_chinese_chars:
+            cleaned = self._tokenize_chinese_chars(cleaned)
+
+        # Both decisions are constant for the whole call, so settle them once:
+        # when lower casing, accents go unless explicitly disabled (`False`);
+        # otherwise accents go only when explicitly enabled.
+        lower = self.do_lower_case
+        strip = (self.strip_accents is not False) if lower else bool(self.strip_accents)
+
+        pieces = []
+        for word in whitespace_tokenize(cleaned):
+            if word not in protected:
+                if lower:
+                    word = word.lower()
+                if strip:
+                    word = self._run_strip_accents(word)
+            pieces.extend(self._run_split_on_punc(word, protected))
+
+        return whitespace_tokenize(" ".join(pieces))
+
+    def _run_strip_accents(self, text):
+        """
+        Removes accents: decomposes to NFD and drops every nonspacing mark ("Mn").
+        """
+        decomposed = unicodedata.normalize("NFD", text)
+        return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
+
+    def _run_split_on_punc(self, text, never_split=None):
+        """
+        Splits `text` so that every punctuation or symbol character is its own
+        piece; a token listed in `never_split` is returned whole.
+        """
+        if never_split is not None and text in never_split:
+            return [text]
+
+        groups = []
+        # True while the last entry of `groups` is a word that can still grow.
+        open_word = False
+        for ch in text:
+            if _is_punctuation(ch) or _is_symbol(ch):
+                groups.append(ch)
+                open_word = False
+            elif open_word:
+                groups[-1] += ch
+            else:
+                groups.append(ch)
+                open_word = True
+        return groups
+
+    def _tokenize_chinese_chars(self, text):
+        """
+        Pads every CJK character with a space on each side.
+        """
+        return "".join(" " + ch + " " if self._is_chinese_char(ord(ch)) else ch for ch in text)
+
+    def _is_chinese_char(self, cp):
+        """
+        Tells whether codepoint `cp` lies in one of the CJK ideograph ranges.
+        """
+        # "Chinese character" here means the CJK ideograph blocks listed below, see
+        #     https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Hangul, Hiragana and Katakana live in other blocks and are written
+        # with space-separated words, so they are deliberately not included.
+        cjk_ranges = (
+            (0x4E00, 0x9FFF),
+            (0x3400, 0x4DBF),
+            (0x20000, 0x2A6DF),
+            (0x2A700, 0x2B73F),
+            (0x2B740, 0x2B81F),
+            (0x2B820, 0x2CEAF),
+            (0xF900, 0xFAFF),
+            (0x2F800, 0x2FA1F),
+        )
+        return any(low <= cp <= high for low, high in cjk_ranges)
+
+    def _clean_text(self, text):
+        """
+        Drops NUL, U+FFFD and control characters and turns every whitespace
+        character into a plain space.
+        """
+        kept = []
+        for ch in text:
+            if ord(ch) in (0, 0xFFFD) or _is_control(ch):
+                continue
+            kept.append(" " if _is_whitespace(ch) else ch)
+        return "".join(kept)
+
+
+class WordpieceTokenizer(object):
+    """
+    Splits words into WordPiece sub-tokens using a fixed vocabulary.
+
+    Args:
+        vocab (Vocab|dict):
+            Container supporting `in`; holds every known word piece.
+        unk_token (str):
+            Token emitted for a word that cannot be fully segmented.
+        max_input_chars_per_word (int):
+            Words longer than this are replaced by `unk_token` without
+            attempting a segmentation. Defaults to 100.
+    """
+
+    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """
+        Segments each whitespace-separated word of `text` greedily: at every
+        position the longest vocabulary entry is taken, and pieces that do not
+        start the word carry a `##` prefix.
+
+        Args:
+            text: A single token or whitespace separated tokens, normally the
+                output of `BasicTokenizer`.
+
+        Returns:
+            list (str): The word pieces; a word with an unmatched remainder
+            contributes a single `unk_token`.
+
+        Examples:
+            .. code-block::
+
+                wordpiecetokenizer = WordpieceTokenizer(vocab, unk_token)
+                inputs = wordpiecetokenizer.tokenize("unaffable")
+                print(inputs)
+                '''
+                ["un", "##aff", "##able"]
+                '''
+        """
+
+        pieces = []
+        for word in whitespace_tokenize(text):
+            length = len(word)
+            if length > self.max_input_chars_per_word:
+                pieces.append(self.unk_token)
+                continue
+
+            matched = []
+            begin = 0
+            while begin < length:
+                # Try the longest candidate first and shrink from the right.
+                for stop in range(length, begin, -1):
+                    candidate = word[begin:stop]
+                    if begin > 0:
+                        candidate = "##" + candidate
+                    if candidate in self.vocab:
+                        matched.append(candidate)
+                        begin = stop
+                        break
+                else:
+                    # Nothing in the vocabulary starts here: the word is unknown.
+                    matched = None
+                    break
+
+            if matched is None:
+                pieces.append(self.unk_token)
+            else:
+                pieces.extend(matched)
+        return pieces
+
+
+class BertTokenizer(PretrainedTokenizer):
+    """
+    Constructs a BERT tokenizer. It uses a basic tokenizer to do punctuation
+    splitting, lower casing and so on, and follows a WordPiece tokenizer to
+    tokenize as subwords.
+
+    Args:
+        vocab_file (str):
+            The vocabulary file path (ends with '.txt') required to instantiate
+            a `WordpieceTokenizer`.
+        do_lower_case (bool, optional):
+            Whether to lowercase the input when tokenizing.
+            Defaults to `True`.
+        do_basic_tokenize (bool, optional):
+            Whether to use a basic tokenizer before a WordPiece tokenizer.
+            Defaults to `True`.
+        never_split (Iterable, optional):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`. Defaults to `None`.
+        unk_token (str, optional):
+            A special token representing the *unknown (out-of-vocabulary)* token.
+            An unknown token is set to be `unk_token` in order to be converted to an ID.
+            Defaults to "[UNK]".
+        sep_token (str, optional):
+            A special token separating two different sentences in the same input.
+            Defaults to "[SEP]".
+        pad_token (str, optional):
+            A special token used to make arrays of tokens the same size for batching purposes.
+            Defaults to "[PAD]".
+        cls_token (str, optional):
+            A special token used for sequence classification. It is the first token
+            of the sequence when built with special tokens. Defaults to "[CLS]".
+        mask_token (str, optional):
+            A special token representing a masked token. This is the token used
+            in the masked language modeling task which the model tries to predict the original unmasked ones.
+            Defaults to "[MASK]".
+        tokenize_chinese_chars (bool, optional):
+            Whether to tokenize Chinese characters.
+            Defaults to `True`.
+        strip_accents: (bool, optional):
+            Whether to strip all accents. If this option is not specified, then it will be determined by the
+            value for `do_lower_case` (as in the original BERT).
+            Defaults to `None`.
+
+    Raises:
+        ValueError: If `vocab_file` is not an existing file.
+
+    Examples:
+        .. code-block::
+
+            from paddlenlp.transformers import BertTokenizer
+            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+            inputs = tokenizer('He was a puppeteer')
+            print(inputs)
+
+            '''
+            {'input_ids': [101, 2002, 2001, 1037, 13997, 11510, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0]}
+            '''
+    """
+
+    resource_files_names = {"vocab_file": "vocab.txt"}  # for save_pretrained
+    pretrained_resource_files_map = {
+        "vocab_file": {
+            "bert-base-uncased": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-uncased-vocab.txt",
+            "bert-large-uncased": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-large-uncased-vocab.txt",
+            "bert-base-cased": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-cased-vocab.txt",
+            "bert-large-cased": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-large-cased-vocab.txt",
+            "bert-base-multilingual-uncased": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-multilingual-uncased-vocab.txt",
+            "bert-base-multilingual-cased": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-multilingual-cased-vocab.txt",
+            "bert-base-chinese": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-chinese-vocab.txt",
+            "bert-wwm-chinese": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-wwm-chinese-vocab.txt",
+            "bert-wwm-ext-chinese": "http://bj.bcebos.com/paddlenlp/models/transformers/bert/bert-wwm-ext-chinese-vocab.txt",
+            "macbert-large-chinese": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-chinese-vocab.txt",
+            "macbert-base-chinese": "https://bj.bcebos.com/paddle-hapi/models/bert/bert-base-chinese-vocab.txt",
+            "simbert-base-chinese": "https://bj.bcebos.com/paddlenlp/models/transformers/simbert/vocab.txt",
+            "uer/chinese-roberta-base": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_vocab.txt",
+            "uer/chinese-roberta-medium": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_vocab.txt",
+            "uer/chinese-roberta-6l-768h": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_vocab.txt",
+            "uer/chinese-roberta-small": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_vocab.txt",
+            "uer/chinese-roberta-mini": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_vocab.txt",
+            "uer/chinese-roberta-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/uer/chinese_roberta_vocab.txt",
+        }
+    }
+    pretrained_init_configuration = {
+        "bert-base-uncased": {"do_lower_case": True},
+        "bert-large-uncased": {"do_lower_case": True},
+        "bert-base-cased": {"do_lower_case": False},
+        "bert-large-cased": {"do_lower_case": False},
+        "bert-base-multilingual-uncased": {"do_lower_case": True},
+        "bert-base-multilingual-cased": {"do_lower_case": False},
+        "bert-base-chinese": {"do_lower_case": False},
+        "bert-wwm-chinese": {"do_lower_case": False},
+        "bert-wwm-ext-chinese": {"do_lower_case": False},
+        "macbert-large-chinese": {"do_lower_case": False},
+        "macbert-base-chinese": {"do_lower_case": False},
+        "simbert-base-chinese": {"do_lower_case": True},
+        "uer/chinese-roberta-base": {"do_lower_case": True},
+        "uer/chinese-roberta-medium": {"do_lower_case": True},
+        "uer/chinese-roberta-6l-768h": {"do_lower_case": True},
+        "uer/chinese-roberta-small": {"do_lower_case": True},
+        "uer/chinese-roberta-mini": {"do_lower_case": True},
+        "uer/chinese-roberta-tiny": {"do_lower_case": True},
+    }
+    max_model_input_sizes = {
+        "bert-base-uncased": 512,
+        "bert-large-uncased": 512,
+        "bert-base-cased": 512,
+        "bert-large-cased": 512,
+        "bert-base-multilingual-uncased": 512,
+        "bert-base-multilingual-cased": 512,
+        "bert-base-chinese": 512,
+        "bert-wwm-chinese": 512,
+        "bert-wwm-ext-chinese": 512,
+        "macbert-large-chinese": 512,
+        "macbert-base-chinese": 512,
+        "simbert-base-chinese": 512,
+        "uer/chinese-roberta-base": 512,
+        "uer/chinese-roberta-medium": 512,
+        "uer/chinese-roberta-6l-768h": 512,
+        "uer/chinese-roberta-small": 512,
+        "uer/chinese-roberta-mini": 512,
+        "uer/chinese-roberta-tiny": 512,
+    }
+    padding_side = "right"
+
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=True,
+        do_basic_tokenize=True,
+        never_split=None,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        **kwargs
+    ):
+
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'. To load the "
+                "vocabulary from a pretrained model please use "
+                "`tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
+            )
+        self.do_lower_case = do_lower_case
+        self.vocab = self.load_vocabulary(vocab_file, unk_token=unk_token)
+        self.do_basic_tokenize = do_basic_tokenize
+        # The basic tokenizer only exists when requested; `_tokenize` checks
+        # `do_basic_tokenize` before touching it.
+        if do_basic_tokenize:
+            self.basic_tokenizer = BasicTokenizer(
+                do_lower_case=do_lower_case,
+                never_split=never_split,
+                tokenize_chinese_chars=tokenize_chinese_chars,
+                strip_accents=strip_accents,
+            )
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=unk_token)
+
+    @property
+    def vocab_size(self):
+        """
+        Return the size of vocabulary.
+
+        Returns:
+            int: The size of vocabulary.
+        """
+
+        return len(self.vocab)
+
+    def get_vocab(self):
+        """
+        Returns the token-to-id mapping of the vocabulary merged with the added tokens.
+
+        Returns:
+            dict: A new dict built from `self.vocab.token_to_idx` and updated
+            with `self.added_tokens_encoder`.
+        """
+        return dict(self.vocab.token_to_idx, **self.added_tokens_encoder)
+
+    def _tokenize(self, text):
+        """
+        End-to-end tokenization for BERT models.
+
+        Args:
+            text (str): The text to be tokenized.
+
+        Returns:
+            list: A list of string representing converted tokens.
+        """
+        split_tokens = []
+        if self.do_basic_tokenize:
+            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
+                # If the token is part of the never_split set
+                if token in self.basic_tokenizer.never_split:
+                    split_tokens.append(token)
+                else:
+                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
+        else:
+            split_tokens = self.wordpiece_tokenizer.tokenize(text)
+        return split_tokens
+
+    def convert_tokens_to_string(self, tokens):
+        """
+        Converts a sequence of tokens (list of string) to a single string. Since
+        the usage of WordPiece introducing `##` to concat subwords, also removes
+        `##` when converting.
+
+        Args:
+            tokens (list): A list of string representing tokens to be converted.
+
+        Returns:
+            str: Converted string from tokens.
+
+        Examples:
+            .. code-block::
+
+                from paddlenlp.transformers import BertTokenizer
+
+                berttokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+                tokens = berttokenizer.tokenize('He was a puppeteer')
+                '''
+                ['he', 'was', 'a', 'puppet', '##eer']
+                '''
+                strings = berttokenizer.convert_tokens_to_string(tokens)
+                '''
+                he was a puppeteer
+                '''
+        """
+
+        out_string = " ".join(tokens).replace(" ##", "").strip()
+        return out_string
+
+    def num_special_tokens_to_add(self, pair=False):
+        """
+        Returns the number of added tokens when encoding a sequence with special tokens.
+
+        Args:
+            pair(bool):
+                Whether the input is a sequence pair or a single sequence.
+                Defaults to `False` and the input is a single sequence.
+
+        Returns:
+            int: Number of tokens added to sequences.
+        """
+        # Building the inputs from empty sequences leaves only the special tokens.
+        token_ids_0 = []
+        token_ids_1 = []
+        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens.
+
+        A BERT sequence has the following format:
+
+        - single sequence:      ``[CLS] X [SEP]``
+        - pair of sequences:        ``[CLS] A [SEP] B [SEP]``
+
+        Args:
+            token_ids_0 (List[int]):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (List[int], optional):
+                Optional second list of IDs for sequence pairs. Defaults to None.
+
+        Returns:
+            List[int]: List of input_id with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        _cls = [self.cls_token_id]
+        _sep = [self.sep_token_id]
+        return _cls + token_ids_0 + _sep + token_ids_1 + _sep
+
+    def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None):
+        """
+        Build offset map from a pair of offset map by concatenating and adding offsets of special tokens.
+
+        A BERT offset_mapping has the following format:
+
+        - single sequence:      ``(0,0) X (0,0)``
+        - pair of sequences:        ``(0,0) A (0,0) B (0,0)``
+
+        Args:
+            offset_mapping_0 (List[tuple]):
+                List of wordpiece offsets to which the special tokens will be added.
+            offset_mapping_1 (List[tuple], optional):
+                Optional second list of wordpiece offsets for offset mapping pairs. Defaults to None.
+
+        Returns:
+            List[tuple]: A list of wordpiece offsets with the appropriate offsets of special tokens.
+        """
+        if offset_mapping_1 is None:
+            return [(0, 0)] + offset_mapping_0 + [(0, 0)]
+
+        return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)]
+
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task.
+
+        A BERT sequence pair mask has the following format:
+        ::
+
+            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+            | first sequence    | second sequence |
+
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (List[int]):
+                A list of `inputs_ids` for the first sequence.
+            token_ids_1 (List[int], optional):
+                Optional second list of IDs for sequence pairs. Defaults to None.
+
+        Returns:
+            List[int]: List of token_type_id according to the given sequence(s).
+        """
+        _sep = [self.sep_token_id]
+        _cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(_cls + token_ids_0 + _sep) * [0]
+        return len(_cls + token_ids_0 + _sep) * [0] + len(token_ids_1 + _sep) * [1]
+
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``encode`` methods.
+
+        Args:
+            token_ids_0 (List[int]):
+                A list of `inputs_ids` for the first sequence.
+            token_ids_1 (List[int], optional):
+                Optional second list of IDs for sequence pairs. Defaults to None.
+            already_has_special_tokens (bool, optional): Whether or not the token list is already
+                formatted with special tokens for the model. Defaults to False.
+
+        Returns:
+            List[int]: The list of integers either be 0 or 1: 1 for a special token, 0 for a sequence token.
+
+        Raises:
+            ValueError: If `token_ids_1` is given together with `already_has_special_tokens=True`.
+        """
+
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formatted with special tokens for the model."
+                )
+            # Read `all_special_ids` once instead of once per token, and use a
+            # set so every membership test is a hash lookup.
+            special_ids = set(self.all_special_ids)
+            return [1 if token_id in special_ids else 0 for token_id in token_ids_0]
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.vocab._idx_to_token.get(index, self.unk_token)

+ 77 - 0
paddlex/inference/models_new/common/tokenizer/tokenizer_utils.py

@@ -2046,3 +2046,80 @@ class PretrainedTokenizer(ChatTemplateMixin, PretrainedTokenizerBase):
             return new_text, read_offset, len(all_input_ids)
         else:
             return "", prefix_offset, read_offset
+
+def _is_control(char):
+    """
+    Tells whether `char` is a control character.
+    Args:
+        char (str): A single character.
+    Returns:
+        bool: True when the Unicode category of `char` starts with "C",
+            except for tab, newline and carriage return.
+    """
+    # Tab, newline and carriage return are formally control characters, but
+    # they are handled as whitespace instead.
+    if char in ("\t", "\n", "\r"):
+        return False
+    return unicodedata.category(char).startswith("C")
+
+def _is_punctuation(char):
+    """
+    Tells whether `char` is a punctuation character.
+    Args:
+        char (str): A single character.
+    Returns:
+        bool: True for any non-alphanumeric printable ASCII character and for
+            any character whose Unicode category starts with "P".
+    """
+    code = ord(char)
+    # Every ASCII character that is neither a letter nor a digit counts as
+    # punctuation, including "^", "$" and "`", which Unicode does not place
+    # in a Punctuation category; this keeps the treatment uniform.
+    if 33 <= code <= 47 or 58 <= code <= 64 or 91 <= code <= 96 or 123 <= code <= 126:
+        return True
+    return unicodedata.category(char).startswith("P")
+
+def _is_symbol(char):
+    """
+    Tells whether `char` is treated as a symbol character.
+    Args:
+        char (str): A single character.
+    Returns:
+        bool: True when the Unicode category of `char` starts with "S" or its
+            codepoint is one of a few explicitly listed extras.
+    """
+    if unicodedata.category(char).startswith("S"):
+        return True
+    # Codepoints outside the "S" categories that are handled as symbols anyway.
+    extra_codepoints = [0x00AD, 0x00B2, 0x00BA, 0x3007, 0x00B5, 0x00D8, 0x014B, 0x01B1]
+    return ord(char) in extra_codepoints
+
+def _is_whitespace(char):
+    """
+    Tells whether `char` is a whitespace character.
+    Args:
+        char (str): A single character.
+    Returns:
+        bool: True for space, tab, newline, carriage return and any character
+            in the Unicode "Zs" (space separator) category.
+    """
+    # Tab, newline and carriage return are formally control characters, yet
+    # they are commonly regarded as whitespace and are treated so here.
+    if char in (" ", "\t", "\n", "\r"):
+        return True
+    return unicodedata.category(char) == "Zs"
+
+def convert_to_unicode(text):
+    """
+    Returns `text` as `str`, decoding it as utf-8 when it is `bytes`.
+    Args:
+        text (str|bytes): The text to convert; undecodable bytes are dropped.
+    Returns:
+        str: The text as a unicode string.
+    Raises:
+        ValueError: If `text` is neither `str` nor `bytes`.
+    """
+    if isinstance(text, str):
+        return text
+    if not isinstance(text, bytes):
+        raise ValueError("Unsupported string type: %s" % (type(text)))
+    return text.decode("utf-8", "ignore")
+
+def whitespace_tokenize(text):
+    """
+    Strips surrounding whitespace from a piece of text and splits it on whitespace.
+    Args:
+        text (str): The text to split.
+    Returns:
+        list(str): The tokens; an empty list when nothing is left after stripping.
+    """
+    stripped = text.strip()
+    return stripped.split() if stripped else []
+    

+ 15 - 0
paddlex/inference/models_new/open_vocabulary_detection/__init__.py

@@ -0,0 +1,15 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .predictor import OVDetPredictor

+ 152 - 0
paddlex/inference/models_new/open_vocabulary_detection/predictor.py

@@ -0,0 +1,152 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Union, Dict, List, Tuple, Optional, Callable
+import numpy as np
+import inspect
+
+from ....utils.func_register import FuncRegister
+from ....modules.open_vocabulary_detection.model_list import MODELS
+from ...common.batch_sampler import ImageBatchSampler
+from ...common.reader import ReadImage
+from .processors import (
+    GroundingDINOProcessor,
+    GroundingDINOPostProcessor
+)
+from ..common import StaticInfer
+from ..base import BasicPredictor
+from ..object_detection.result import DetResult
+
+
+class OVDetPredictor(BasicPredictor):
+    """Predictor for text-prompted open-vocabulary detection models.
+
+    Preprocess ops are created from the ``Preprocess`` section of the model
+    config through the builder methods registered with ``register``. The last
+    preprocess op is treated as the model-specific processor: it receives the
+    text prompt, and its tokenizer is shared with the postprocess op.
+    """
+
+    entities = MODELS
+
+    _FUNC_MAP = {}
+    register = FuncRegister(_FUNC_MAP)
+
+    def __init__(self, *args, thresholds: Optional[Union[Dict, float]] = None, **kwargs):
+        """Initializes OVDetPredictor.
+        Args:
+            *args: Arbitrary positional arguments passed to the superclass.
+            thresholds (Optional[Union[Dict, float]], optional): The thresholds for filtering out
+                low-confidence predictions. A float is stored as ``{"threshold": value}``; use a
+                dict to record multiple named thresholds. Defaults to None.
+            **kwargs: Arbitrary keyword arguments passed to the superclass.
+        """
+        super().__init__(*args, **kwargs)
+        if isinstance(thresholds, float):
+            thresholds = {"threshold": thresholds}
+        self.thresholds = thresholds
+        self.pre_ops, self.infer, self.post_op = self._build()
+
+    def _build_batch_sampler(self):
+        """Return the batch sampler used to group input images."""
+        return ImageBatchSampler()
+
+    def _get_result_class(self):
+        """Return the result class used for predictions (``DetResult``)."""
+        return DetResult
+
+    def _build(self):
+        """Build the preprocess ops, the inference engine and the postprocess op.
+
+        Returns:
+            tuple: ``(pre_ops, infer, post_op)``.
+        """
+        # build model preprocess ops
+        pre_ops = [ReadImage(format="RGB")]
+        for cfg in self.config["Preprocess"]:
+            func = self._FUNC_MAP[cfg["type"]]
+            # Copy the remaining entries instead of popping "type", so the
+            # loaded config is left untouched.
+            args = {key: value for key, value in cfg.items() if key != "type"}
+            op = func(self, **args)
+            if op:
+                pre_ops.append(op)
+
+        # build infer
+        infer = StaticInfer(
+            model_dir=self.model_dir,
+            model_prefix=self.MODEL_FILE_PREFIX,
+            option=self.pp_option,
+        )
+
+        # build postprocess op
+        post_op = self.build_postprocess(pre_ops=pre_ops)
+
+        return pre_ops, infer, post_op
+
+    def process(self, batch_data: List[Any], prompt: str, thresholds: Optional[dict] = None):
+        """
+        Process a batch of data through the preprocessing, inference, and postprocessing.
+
+        Args:
+            batch_data (List[str]): A batch of input data (e.g., image file paths).
+            prompt (str): Text prompt for open vocabulary detection.
+            thresholds (Optional[dict]): thresholds used for postprocess; they override the
+                thresholds given at construction time.
+
+        Returns:
+            dict: A dictionary with the keys 'input_path' (the batch as received), 'input_img'
+                (the images produced by the first preprocess op) and 'boxes' (the postprocess
+                output for every instance of the batch).
+        """
+        image_paths = batch_data
+        src_images = self.pre_ops[0](batch_data)
+        datas = src_images
+        # preprocess
+        for pre_op in self.pre_ops[1:-1]:
+            datas = pre_op(datas)
+
+        # use Model-specific preprocessor to format batch inputs
+        batch_inputs = self.pre_ops[-1](datas, prompt)
+
+        # do infer
+        batch_preds = self.infer(batch_inputs)
+
+        # postprocess
+        current_thresholds = self._parse_current_thresholds(
+            self.post_op, self.thresholds, thresholds
+        )
+        boxes = self.post_op(
+            *batch_preds, prompt=prompt, src_images=src_images, **current_thresholds
+        )
+
+        return {
+            "input_path": image_paths,
+            "input_img": src_images,
+            "boxes": boxes,
+        }
+
+    def _parse_current_thresholds(self, func, init_thresholds, process_thresholds):
+        """Collect the threshold keyword arguments for one postprocess call.
+
+        Every parameter of ``func`` whose name contains "threshold" starts as
+        None. Values from ``init_thresholds`` and then ``process_thresholds``
+        are layered on top, so per-call thresholds take precedence.
+
+        Args:
+            func (Callable): The postprocess callable whose signature is inspected.
+            init_thresholds (Optional[dict]): Thresholds given at construction time.
+            process_thresholds (Optional[dict]): Thresholds given for this call.
+
+        Returns:
+            dict: Mapping from threshold name to value (None when unset).
+        """
+        assert isinstance(func, Callable)
+        thr2val = {
+            name: None
+            for name in inspect.signature(func).parameters
+            if "threshold" in name
+        }
+        if init_thresholds is not None:
+            thr2val.update(init_thresholds)
+        if process_thresholds is not None:
+            thr2val.update(process_thresholds)
+        return thr2val
+
+    def build_postprocess(self, **kwargs):
+        """Build the postprocess op that matches the model name.
+
+        Args:
+            **kwargs: Must contain ``pre_ops``; the tokenizer of its last op is reused.
+
+        Raises:
+            NotImplementedError: If the model name does not contain "GroundingDINO".
+        """
+        if "GroundingDINO" in self.model_name:
+            pre_ops = kwargs.get("pre_ops")
+            return GroundingDINOPostProcessor(
+                tokenizer=pre_ops[-1].tokenizer,
+                box_threshold=self.config["box_threshold"],
+                text_threshold=self.config["text_threshold"],
+            )
+        else:
+            raise NotImplementedError
+
+    @register("GroundingDINOProcessor")
+    def build_grounding_dino_preprocessor(self, text_max_words=256, target_size=(800, 1333)):
+        """Build the GroundingDINO image/text processor from config arguments."""
+        # GroundingDINOProcessor names this parameter ``image_target_size`` and
+        # swallows unknown keywords, so passing ``target_size=`` would be
+        # silently ignored.
+        return GroundingDINOProcessor(
+            model_dir=self.model_dir,
+            text_max_words=text_max_words,
+            image_target_size=target_size,
+        )

+ 15 - 0
paddlex/inference/models_new/open_vocabulary_detection/processors/__init__.py

@@ -0,0 +1,15 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .groundingdino_processors import GroundingDINOProcessor, GroundingDINOPostProcessor

+ 425 - 0
paddlex/inference/models_new/open_vocabulary_detection/processors/groundingdino_processors.py

@@ -0,0 +1,425 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import Dict, List, Optional, Union, Tuple
+
+import numpy as np
+import PIL
+
+import paddle
+import paddle.vision.transforms as T
+import paddle.nn.functional as F
+from ...common.tokenizer.bert_tokenizer import BertTokenizer
+
+def _max_by_axis(the_list):
+    """Return the element-wise maximum over a list of lists.
+
+    Args:
+        the_list (list[list]): Non-empty list of sub-lists, e.g. one shape per
+            image. No sub-list may be longer than the first one.
+    Returns:
+        list: A new list holding the maximum found at each index.
+    """
+    # Copy the first row so the caller's data is not modified in place.
+    maxes = list(the_list[0])
+    for sublist in the_list[1:]:
+        for index, item in enumerate(sublist):
+            maxes[index] = max(maxes[index], item)
+    return maxes
+
+def _text_pad_batch_data(
+    insts,
+    pad_idx=0,
+    max_seq_len=None,
+    return_pos=False,
+    return_input_mask=False,
+    return_max_len=False,
+    return_num_token=False,
+    return_seq_lens=False,
+    pad_2d_pos_ids=False,
+    pad_segment_id=False,
+    select=False,
+    extract=False,
+):
+    """Pad every instance to a common length and optionally return extra batch data.
+
+    The target length is `max_seq_len` when given, otherwise the length of the
+    longest instance. With `pad_2d_pos_ids` the instances are padded with
+    `[0, 0, 0, 0]` boxes and returned directly as an int64 array. Otherwise the
+    padded ids (int64, shape [batch, max_len, 1]) come first, followed by the
+    items requested through the `return_*` flags, in this order: positions,
+    input mask (float32), max length, token count, sequence lengths.
+    `pad_segment_id`, `select` and `extract` are accepted but not used here.
+
+    Returns:
+        The single padded array when nothing else was requested, otherwise a
+        list of the requested items.
+    """
+    outputs = []
+    lengths = [len(inst) for inst in insts]
+    max_len = max_seq_len if max_seq_len is not None else max(lengths)
+
+    if pad_2d_pos_ids:
+        padded_boxes = [
+            box_seq + [[0, 0, 0, 0]] * (max_len - n) for box_seq, n in zip(insts, lengths)
+        ]
+        return np.array(padded_boxes, dtype="int64")
+
+    padded_ids = np.array(
+        [inst + [pad_idx] * (max_len - n) for inst, n in zip(insts, lengths)]
+    )
+    outputs.append(padded_ids.astype("int64").reshape([-1, max_len, 1]))
+
+    if return_pos:
+        positions = np.array(
+            [list(range(n)) + [pad_idx] * (max_len - n) for n in lengths]
+        )
+        outputs.append(positions.astype("int64").reshape([-1, max_len, 1]))
+
+    if return_input_mask:
+        mask = np.array([[1] * n + [0] * (max_len - n) for n in lengths])
+        outputs.append(np.expand_dims(mask, axis=-1).astype("float32"))
+
+    if return_max_len:
+        outputs.append(max_len)
+
+    if return_num_token:
+        outputs.append(sum(lengths))
+
+    if return_seq_lens:
+        outputs.append(np.array(lengths).astype("int64").reshape([-1, 1]))
+
+    if len(outputs) > 1:
+        return outputs
+    return outputs[0]
+
+
+class GroundingDINOPostProcessor(object):
+    """PostProcessors for GroundingDINO
+
+    Turns the raw box/logit predictions of a batch into, per image, a list of
+    dicts with the keys "coordinate", "label" and "score".
+    """
+    def __init__(
+        self,
+        tokenizer,
+        box_threshold: float = 0.3,
+        text_threshold: float = 0.25,
+    ):
+        """Init Function for GroundingDINO PostProcessor
+
+        Args:
+            tokenizer (BertTokenizer): tokenizer used to tokenize the prompt and decode labels.
+            box_threshold (float): threshold for low confidence bbox filtering.
+            text_threshold (float): threshold for determining predicted bbox text labels.
+        """
+        self.tokenizer = tokenizer
+        self.box_threshold = box_threshold
+        self.text_threshold = text_threshold
+
+    def __call__(self, pred_boxes, pred_logits, prompt, src_images, box_threshold = None, text_threshold = None, **kwargs):
+        """Post process the predictions of a whole batch.
+
+        Args:
+            pred_boxes (np.ndarray|paddle.Tensor): 3-dim box predictions, first axis is the batch.
+            pred_logits (np.ndarray|paddle.Tensor): 3-dim logits, first axis is the batch.
+            prompt (str): text prompt used for this batch.
+            src_images (list): source images, one per batch entry; only `.shape` is read.
+            box_threshold (float, optional): overrides the threshold given at construction.
+            text_threshold (float, optional): overrides the threshold given at construction.
+            **kwargs: accepted and ignored.
+
+        Returns:
+            list: one list of box dicts per image.
+        """
+        
+        # Fall back to the thresholds given at construction time.
+        box_threshold = self.box_threshold if box_threshold is None else box_threshold
+        text_threshold = self.text_threshold if text_threshold is None else text_threshold
+
+        if isinstance(pred_logits, np.ndarray):
+            pred_logits = paddle.to_tensor(pred_logits)
+        if isinstance(pred_boxes, np.ndarray):
+            pred_boxes = paddle.to_tensor(pred_boxes)
+
+        assert pred_logits.ndim == 3 and pred_boxes.ndim == 3
+        assert pred_logits.shape[0] == pred_boxes.shape[0] == len(src_images)
+
+        rst_boxes = []
+        for pred_logit, pred_box, src_image in zip(pred_logits, pred_boxes, src_images):
+            rst_boxes.append(
+                self.postprocess(pred_logit, pred_box, prompt, src_image, box_threshold, text_threshold)
+            )
+
+        return rst_boxes
+        
+    def postprocess(self, pred_logits, pred_boxes, src_prompt, src_image, box_threshold, text_threshold):
+        """Post Process for prediction result of single image.
+
+        Returns:
+            list(dict): one dict per kept box with the keys "coordinate", "label" and "score".
+        """
+
+        logits = F.sigmoid(pred_logits) 
+        boxes = pred_boxes
+
+        # Keep only the rows whose best score (max over axis 1) exceeds box_threshold.
+        logits_filt = logits.clone()
+        boxes_filt = boxes.clone()
+        filt_mask = logits_filt.max(axis=1) > box_threshold
+        logits_filt = logits_filt[filt_mask]
+        boxes_filt = boxes_filt[filt_mask]
+
+        # The first two dimensions of the source image are taken as height and width.
+        H, W, *_ = src_image.shape
+
+        pred_bboxes = []
+        for logit, box in zip(logits_filt, boxes_filt):
+            pred_phrase = self.decode(logit > text_threshold, src_prompt)
+            pred_score = logit.max().item()
+            # Subtract half of the last two values from the first two, then add
+            # the result to the last two -- presumably converting normalized
+            # (cx, cy, w, h) to (x1, y1, x2, y2); the order of these two
+            # in-place updates matters.
+            box[:2] -= box[2:] / 2
+            box[2:] += box[:2]
+            # Scale to the size of the source image.
+            box *= paddle.to_tensor([W, H, W, H]).astype(paddle.float32)
+            pred_bboxes.append(
+                {
+                    "coordinate": box.detach().cpu().tolist(),
+                    "label": pred_phrase,
+                    "score": pred_score
+                }
+            )
+
+        return pred_bboxes
+
+    def decode(self, posmap, prompt):
+        """Decode the prompt tokens selected by a 1-dim position map.
+
+        The prompt is tokenized on every call; the token ids at the non-zero
+        positions of `posmap` are decoded back to text.
+
+        Raises:
+            NotImplementedError: If `posmap` is not 1-dim.
+        """
+
+        tokenized = self.tokenizer(prompt)
+        if posmap.dim() == 1:
+            non_zero_idx = posmap.nonzero(as_tuple=True)[0].squeeze(-1).tolist()
+            token_ids = [tokenized["input_ids"][i] for i in non_zero_idx]
+            return self.tokenizer.decode(token_ids)
+        else:
+            raise NotImplementedError("posmap must be 1-dim")
+
+
+class GroundingDINOProcessor(object):
+    """Joint image and text preprocessor that prepares GroundingDINO model inputs.
+    """
+
+    def __init__(
+        self,
+        model_dir,
+        text_max_words: int = 256,
+        image_do_resize: bool = True,
+        image_target_size: Union[Tuple[int], int] = (800, 1333),
+        image_do_normalize: bool = True,
+        image_mean: Union[float, List[float]] = [0.485, 0.456, 0.406],
+        image_std: Union[float, List[float]] = [0.229, 0.224, 0.225],
+        image_do_nested: bool = True,
+        **kwargs,
+    ):
+        """Create the text processor, the image processor and the tokenizer.
+
+        Args:
+            model_dir (str): Model directory; it must contain a `tokenizer` sub-directory.
+            text_max_words (int): Maximum text length handed to the text processor.
+            image_do_resize (bool): Whether images are resized.
+            image_target_size (tuple|int): Target size handed to the image processor.
+            image_do_normalize (bool): Whether images are normalized.
+            image_mean (float|list): Mean used for normalization.
+            image_std (float|list): Std used for normalization.
+            image_do_nested (bool): Whether images are padded into one nested batch.
+            **kwargs: Accepted and ignored.
+        """
+        self.text_processor = GroundingDinoTextProcessor(text_max_words)
+
+        image_options = dict(
+            do_resize=image_do_resize,
+            target_size=image_target_size,
+            do_normalize=image_do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_nested=image_do_nested,
+        )
+        self.image_processor = GroundingDinoImageProcessor(**image_options)
+
+        tokenizer_dir = os.path.join(model_dir, 'tokenizer')
+        assert os.path.isdir(tokenizer_dir), f'{tokenizer_dir} not exists.'
+        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)
+
+    def __call__(
+        self,
+        images: List[PIL.Image.Image],
+        text: str,
+        **kwargs,
+    ):
+        """Preprocess a batch of images together with one text prompt.
+
+        The normalized prompt is kept in `self.prompt`.
+
+        Returns:
+            list: Arrays in the order attention mask, input ids, image mask,
+                position ids, text self-attention masks, image tensor.
+        """
+        prompt = self.text_processor.pre_caption(text)
+        self.prompt = prompt
+
+        token_ids = self.tokenizer([prompt]).input_ids
+        special_ids = self.tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"])
+        text_inputs = self.text_processor(token_ids, special_ids)
+
+        image_tensor, mask = self.image_processor(images)
+
+        ordered_inputs = (
+            text_inputs['attention_mask'],
+            text_inputs['input_ids'],
+            mask,
+            text_inputs['position_ids'],
+            text_inputs['text_self_attention_masks'],
+            image_tensor,
+        )
+        return [tensor.numpy() for tensor in ordered_inputs]
+
+
+class GroundingDinoTextProcessor(object):
+    """Constructs a GroundingDino text processor.
+
+    Pads tokenized prompts and derives the attention masks and position ids
+    that are split at special tokens.
+    """
+    def __init__(
+        self,
+        max_words: int = 256,
+    ):
+        """
+        Args:
+            max_words (int): maximum number of tokens kept per prompt.
+        """
+        self.max_words = max_words
+
+    def __call__(
+        self,
+        input_ids,
+        special_tokens_list,
+    ):
+        """Build the text inputs from already tokenized prompts.
+
+        Args:
+            input_ids (list): token ids of every prompt in the batch.
+            special_tokens_list (list): ids of the special tokens that delimit segments.
+
+        Returns:
+            dict: with the keys "input_ids", "attention_mask", "position_ids"
+                and "text_self_attention_masks".
+        """
+        tokenized_out = {}
+        # Pad to the longest prompt, then drop the trailing axis of size 1.
+        input_ids = _text_pad_batch_data(input_ids)
+        input_ids = paddle.to_tensor(input_ids, dtype=paddle.int64).squeeze(-1)
+        tokenized_out["input_ids"] = input_ids 
+        # Every id different from 0 (the padding value used above) is attended to.
+        tokenized_out["attention_mask"] = paddle.cast(input_ids != 0, paddle.int64)
+
+        # cate_to_token_mask_list is returned by the helper but not used here.
+        (
+            text_self_attention_masks,
+            position_ids,
+            cate_to_token_mask_list,
+        ) = self.generate_masks_with_special_tokens_and_transfer_map(tokenized_out, special_tokens_list)
+
+        # Truncate every output to at most max_words tokens.
+        if text_self_attention_masks.shape[1] > self.max_words:
+            text_self_attention_masks = text_self_attention_masks[:, : self.max_words, : self.max_words]
+            position_ids = position_ids[:, : self.max_words]
+            tokenized_out["input_ids"] = tokenized_out["input_ids"][:, : self.max_words]
+            tokenized_out["attention_mask"] = tokenized_out["attention_mask"][:, : self.max_words]
+        tokenized_out["position_ids"] = position_ids
+        tokenized_out["text_self_attention_masks"] = text_self_attention_masks
+
+        return tokenized_out
+
+    def pre_caption(self, caption: str) -> str:
+        """Preprocess the text before tokenization.
+
+        Strips surrounding whitespace and appends a "." when the caption does
+        not already end with one.
+        """
+        caption = caption.strip()
+        if not caption.endswith("."):
+            caption = caption + "."
+        return caption
+
+    def generate_masks_with_special_tokens_and_transfer_map(self, tokenized, special_tokens_list):
+        """Generate attention mask between each pair of special tokens
+        Args:
+            tokenized (dict): tokenized text; only `tokenized["input_ids"]`
+                (paddle.Tensor, shape: [bs, num_token]) is read.
+            special_tokens_list (list): ids of the special tokens.
+        Returns:
+            tuple: `(attention_mask, position_ids, cate_to_token_mask_list)`.
+                attention_mask is a bool tensor of shape [bs, num_token, num_token],
+                position_ids is an int64 tensor of shape [bs, num_token], and
+                cate_to_token_mask_list holds, per batch row, one bool mask of
+                length num_token for every segment.
+        """
+        input_ids = tokenized["input_ids"]
+        bs, num_token = input_ids.shape
+        # Mark every position that holds one of the special tokens.
+        special_tokens_mask = paddle.zeros((bs, num_token), dtype=paddle.bool)
+        for special_token in special_tokens_list:
+            special_tokens_mask |= input_ids == special_token
+
+        # (row, col) index pairs of the special tokens.
+        idxs = paddle.nonzero(special_tokens_mask)
+
+        # Start from an identity mask: every token attends to itself.
+        attention_mask = paddle.eye(num_token, dtype=paddle.int32).cast(paddle.bool).unsqueeze(0).tile([bs, 1, 1])
+        position_ids = paddle.zeros((bs, num_token), dtype=paddle.int64)
+        cate_to_token_mask_list = [[] for _ in range(bs)]
+        # NOTE(review): previous_col is not reset per row; it is only set back
+        # to 0 when a special token sits at column 0 -- presumably every row
+        # starts with [CLS]; confirm.
+        previous_col = 0
+
+        for i in range(idxs.shape[0]): 
+            row, col = idxs[i] 
+            if (col == 0) or (col == num_token - 1):
+                # A special token at either end only attends to itself.
+                attention_mask[row, col, col] = True
+                position_ids[row, col] = 0
+            else:
+                # Tokens after the previous special token up to and including
+                # this one form a block that attends to itself, with position
+                # ids restarting at 0.
+                attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True
+                position_ids[row, previous_col + 1 : col + 1] = paddle.arange(0, col - previous_col)
+                c2t_maski = paddle.zeros(
+                    [
+                        num_token,
+                    ]
+                ).cast(paddle.bool)
+                # The segment mask excludes the closing special token itself.
+                c2t_maski[previous_col + 1 : col] = True
+                cate_to_token_mask_list[row].append(c2t_maski)
+            previous_col = col 
+
+        return attention_mask, position_ids.cast(paddle.int64), cate_to_token_mask_list
+
+class GroundingDinoImageProcessor(object):
+    """Constructs a GroundingDino image processor.
+
+    Resizes, normalizes and pads a batch of images according to the options
+    given at construction time; every option can be overridden per call.
+    """
+    def __init__(
+        self,
+        do_resize: bool = True,
+        target_size: Union[Tuple[int], int] = (800, 1333),
+        do_normalize: bool = True,
+        image_mean: Union[float, List[float]] = [0.485, 0.456, 0.406],
+        image_std: Union[float, List[float]] = [0.229, 0.224, 0.225],
+        do_nested: bool = True,
+    ) -> None:
+        """
+        Args:
+            do_resize (bool): Whether images are resized.
+            target_size (tuple|int): Two sizes; the smaller one is the target for the
+                shorter image side and the larger one caps the longer side. An int is
+                expanded to a pair of equal values.
+            do_normalize (bool): Whether images are normalized.
+            image_mean (float|list): Mean used for normalization.
+            image_std (float|list): Std used for normalization.
+            do_nested (bool): Whether images are padded into one batch tensor.
+        """
+
+        if isinstance(target_size, int):
+            target_size = (target_size, target_size)
+        assert isinstance(target_size, (tuple, list)) and len(target_size) == 2
+        self.target_size = target_size
+
+        self.do_resize = do_resize
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_nested = do_nested
+
+    def __call__(self, images, **kwargs):
+        """Preprocess an image or a batch of images."""
+        return self.preprocess(images, **kwargs)
+
+    def resize(self, image, size=None, max_size=1333):
+        """Officially aligned Image resize.
+
+        Args:
+            image (PIL.Image.Image): Image to resize; `image.size` is read as (w, h).
+            size (int|list|tuple): An int is the target for the shorter side and keeps
+                the aspect ratio; a list/tuple is used as an explicit size after
+                being reversed.
+            max_size (int): Upper bound for the longer side when `size` is an int.
+
+        Returns:
+            The resized image.
+        """
+        def get_size_with_aspect_ratio(image_size, size, max_size=None):
+            w, h = image_size
+            if max_size is not None:
+                min_original_size = float(min((w, h)))
+                max_original_size = float(max((w, h)))
+                # Shrink the target when the longer side would exceed max_size.
+                if max_original_size / min_original_size * size > max_size:
+                    size = int(round(max_size * min_original_size / max_original_size))
+
+            # The shorter side already has the requested size.
+            if (w <= h and w == size) or (h <= w and h == size):
+                return (h, w)
+
+            if w < h:
+                ow = size
+                oh = int(size * h / w)
+            else:
+                oh = size
+                ow = int(size * w / h)
+
+            return (oh, ow)
+
+        def get_size(image_size, size, max_size=None):
+            if isinstance(size, (list, tuple)):
+                return size[::-1]
+            else:
+                return get_size_with_aspect_ratio(image_size, size, max_size)
+
+        size = get_size(image.size, size, max_size)
+        rescaled_image = T.resize(image, size)
+
+        return rescaled_image
+
+    def nested_tensor_from_tensor_list(self, tensor_list):
+        """Pad 3-dim images to a common shape and stack them into one batch.
+
+        Returns:
+            tuple: `(tensor, mask)`. `tensor` has shape [b, c, h, w] and is zero
+                outside each image; `mask` is a bool tensor of shape [b, h, w]
+                that is False where an image has data and True on padding.
+
+        Raises:
+            ValueError: If the first image is not 3-dim.
+        """
+        if tensor_list[0].ndim != 3:
+            raise ValueError(
+                f"Not supported tensor format of {tensor_list[0].shape}, only support shape like 'CHW' ."
+            )
+
+        max_size = _max_by_axis([list(img.shape) for img in tensor_list])
+        batch_shape = [len(tensor_list)] + max_size
+        b, c, h, w = batch_shape
+        dtype = tensor_list[0].dtype
+        tensor = paddle.zeros(batch_shape, dtype=dtype)
+        mask = paddle.ones((b, h, w), dtype=paddle.bool)
+        for i in range(b):
+            img = tensor_list[i]
+            tensor[i, : img.shape[0], : img.shape[1], : img.shape[2]] = img
+            mask[i, : img.shape[1], : img.shape[2]] = False
+        return tensor, mask
+
+    def preprocess(
+        self,
+        images,
+        do_resize: Optional[bool] = None,
+        target_size: Optional[Union[Tuple[int], int]] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        do_nested: Optional[bool] = None,
+        **kwargs,
+    ):
+        """Preprocess an image or batch of images.
+
+        Every option left as None falls back to the value given at
+        construction time.
+
+        Args:
+            images: One image or a list/tuple of images; numpy arrays are
+                converted to PIL images first.
+            do_resize (bool, optional): Whether images are resized.
+            target_size (tuple|int, optional): See `__init__`.
+            do_normalize (bool, optional): Whether images are normalized.
+            image_mean (float|list, optional): Mean used for normalization.
+            image_std (float|list, optional): Std used for normalization.
+            do_nested (bool, optional): Whether images are padded into one batch.
+            **kwargs: Accepted and ignored.
+
+        Returns:
+            tuple: `(tensors, masks)`. When `do_nested` is False the images are
+                returned without padding and `masks` is None.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        do_nested = do_nested if do_nested is not None else self.do_nested
+
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        target_size = target_size if target_size is not None else self.target_size
+        if isinstance(target_size, int):
+            target_size = (target_size, target_size)
+
+        if not isinstance(images, (list, tuple)):
+            images = [images]
+        if isinstance(images[0], np.ndarray):
+            images = [
+                PIL.Image.fromarray(image) for image in images
+            ]
+
+        if do_resize:
+            # Use the resolved target size so that a per-call override is
+            # honored instead of always reading self.target_size.
+            min_size = min(target_size)
+            max_size = max(target_size)
+            images = [T.to_tensor(self.resize(image=image, size=min_size, max_size=max_size)) for image in images]
+
+        if do_normalize:
+            images = T.normalize(images, mean=image_mean, std=image_std)
+
+        if not do_nested:
+            # Without nesting there is no padding and therefore no mask;
+            # previously this path failed on unbound result names.
+            return images, None
+
+        return self.nested_tensor_from_tensor_list(images)

+ 15 - 0
paddlex/inference/models_new/open_vocabulary_segmentation/__init__.py

@@ -0,0 +1,15 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .predictor import OVSegPredictor

+ 124 - 0
paddlex/inference/models_new/open_vocabulary_segmentation/predictor.py

@@ -0,0 +1,124 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Any, Union, Dict, List, Tuple, Optional, Callable
+import numpy as np
+import inspect
+
+from ....utils.func_register import FuncRegister
+from ....modules.open_vocabulary_segmentation.model_list import MODELS
+from ...common.batch_sampler import ImageBatchSampler
+from ...common.reader import ReadImage
+from .processors import (
+    SAMProcessor
+)
+from ..common import StaticInfer
+from ..base import BasicPredictor
+from .results import SAMSegResult
+
+
+class OVSegPredictor(BasicPredictor):
+    """Predictor for prompt-driven open-vocabulary segmentation models.
+
+    Optional preprocess ops come from the ``Preprocess`` section of the model
+    config; the model-specific processor, which handles both pre- and
+    postprocessing, comes from the required ``Processor`` section.
+    """
+
+    entities = MODELS
+
+    _FUNC_MAP = {}
+    register = FuncRegister(_FUNC_MAP)
+
+    def __init__(self, *args, **kwargs):
+        """Initializes OVSegPredictor.
+        Args:
+            *args: Arbitrary positional arguments passed to the superclass.
+            **kwargs: Arbitrary keyword arguments passed to the superclass.
+        """
+        super().__init__(*args, **kwargs)
+        self.pre_ops, self.infer, self.processor = self._build()
+
+    def _build_batch_sampler(self):
+        """Return the batch sampler used to group input images."""
+        return ImageBatchSampler()
+
+    def _get_result_class(self):
+        """Return the result class used for predictions (``SAMSegResult``)."""
+        return SAMSegResult
+
+    def _build(self):
+        """Build the preprocess ops, the inference engine and the model processor.
+
+        Returns:
+            tuple: ``(pre_ops, infer, processor)``.
+        """
+        # build model preprocess ops
+        pre_ops = [ReadImage(format="RGB")]
+        for cfg in self.config.get("Preprocess", []):
+            func = self._FUNC_MAP[cfg["type"]]
+            # Copy the remaining entries instead of popping "type", so the
+            # loaded config is left untouched.
+            args = {key: value for key, value in cfg.items() if key != "type"}
+            op = func(self, **args)
+            if op:
+                pre_ops.append(op)
+
+        # build infer
+        infer = StaticInfer(
+            model_dir=self.model_dir,
+            model_prefix=self.MODEL_FILE_PREFIX,
+            option=self.pp_option,
+        )
+
+        # build model specific processor, it's required for a OV model.
+        processor_cfg = self.config["Processor"]
+        func = self._FUNC_MAP[processor_cfg["type"]]
+        args = {key: value for key, value in processor_cfg.items() if key != "type"}
+        processor = func(self, **args)
+
+        return pre_ops, infer, processor
+
+    def process(self, batch_data: List[Any], prompts: Dict[str, Any]):
+        """
+        Process a batch of data through the preprocessing, inference, and postprocessing.
+
+        Args:
+            batch_data (List[str]): A batch of input data (e.g., image file paths).
+            prompts (Dict[str, Any]): Prompts for open vocabulary segmentation; they are passed
+                as keyword arguments to the processor's ``preprocess``.
+
+        Returns:
+            dict: A dictionary with the keys 'input_path' (the batch as received), 'input_img'
+                (the images produced by the first preprocess op), 'prompts' (the prompts,
+                repeated once per input) and 'masks' (the postprocess output).
+        """
+        image_paths = batch_data
+        src_images = self.pre_ops[0](batch_data)
+        datas = src_images
+        # preprocess: the model-specific processor is kept separately in
+        # self.processor, so every op after the reader has to run here.
+        for pre_op in self.pre_ops[1:]:
+            datas = pre_op(datas)
+
+        # use Model-specific preprocessor to format batch inputs
+        batch_inputs = self.processor.preprocess(datas, **prompts)
+
+        # do infer
+        batch_preds = self.infer(batch_inputs)
+
+        # postprocess
+        masks = self.processor.postprocess(batch_preds)
+
+        return {
+            "input_path": image_paths,
+            "input_img": src_images,
+            "prompts": [prompts] * len(image_paths),
+            "masks": masks,
+        }
+
+    @register("SAMProcessor")
+    def build_sam_preprocessor(self, size=1024, mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375]):
+        """Build the SAM processor from config arguments."""
+        # SAMProcessor names these parameters ``image_mean`` / ``image_std``
+        # and swallows unknown keywords, so ``img_mean=`` / ``img_std=`` would
+        # be silently ignored.
+        return SAMProcessor(
+            size=size,
+            image_mean=mean,
+            image_std=std,
+        )

+ 15 - 0
paddlex/inference/models_new/open_vocabulary_segmentation/processors/__init__.py

@@ -0,0 +1,15 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .sam_processer import SAMProcessor

+ 232 - 0
paddlex/inference/models_new/open_vocabulary_segmentation/processors/sam_processer.py

@@ -0,0 +1,232 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import Dict, List, Optional, Union, Tuple
+
+import numpy as np
+import PIL
+from copy import deepcopy
+
+import paddle
+import paddle.vision.transforms as T
+import paddle.nn.functional as F
+
+def _get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]:
+    """Return the (height, width) obtained by scaling the longer side to ``long_side_length``.
+
+    Both sides are multiplied by the same factor; each result is rounded by
+    adding 0.5 and truncating with ``int``.
+    """
+    ratio = long_side_length * 1.0 / max(oldh, oldw)
+    resized_h = int(oldh * ratio + 0.5)
+    resized_w = int(oldw * ratio + 0.5)
+    return (resized_h, resized_w)
+
+class SAMProcessor(object):
+    """Pre- and post-processing wrapper around the SAM image and prompt processors.
+
+    ``preprocess`` records the original and resized image sizes on the
+    instance; ``postprocess`` reads them back to map predicted masks to the
+    original resolution, so ``preprocess`` has to run first.
+    """
+
+    def __init__(
+        self,
+        size: Optional[Union[List[int], int]] = None,
+        image_mean: Union[float, List[float]] = [123.675, 116.28, 103.53],
+        image_std: Union[float, List[float]] = [58.395, 57.12, 57.375],
+        **kwargs,
+    ) -> None:
+        """Create the image and prompt sub-processors.
+
+        Args:
+            size: Target size passed to both sub-processors; ``None`` means 1024.
+            image_mean: Normalization mean; a single float is repeated 3 times.
+            image_std: Normalization std; a single float is repeated 3 times.
+            **kwargs: Accepted and ignored.
+        """
+
+        size = size if size is not None else 1024
+        self.size = size
+
+        if isinstance(image_mean, float):
+            image_mean = [image_mean] * 3
+        if isinstance(image_std, float):
+            image_std = [image_std] * 3
+
+        self.image_mean = image_mean
+        self.image_std = image_std
+
+        # Filled in by ``preprocess``; ``postprocess`` needs both of them.
+        self.original_size = None
+        self.input_size = None
+
+        self.image_processor = SamImageProcessor(self.size, self.image_mean, self.image_std)
+        self.prompt_processor = SamPromptProcessor(self.size)
+
+    def preprocess(
+        self,
+        images,
+        *,
+        point_prompt=None,
+        box_prompt=None,
+        **kwargs,
+    ):
+        """Prepare the image batch and exactly one kind of prompt for inference.
+
+        Args:
+            images: Input image(s) handed to the image processor.
+            point_prompt: Point prompt, reshaped to ``(-1, 2)``; only a single
+                point is accepted.
+            box_prompt: Box prompt(s), reshaped to ``(-1, 4)``.
+            **kwargs: Accepted and ignored.
+
+        Returns:
+            tuple: ``(image_seg, prompt)`` as produced by the image processor
+            and the prompt processor.
+
+        Raises:
+            ValueError: If both or neither prompt is given, or if more than
+                one point is supplied.
+        """
+
+        if point_prompt is not None and box_prompt is not None:
+            raise ValueError(
+                "SAM can only use either points or boxes as prompt, not both at the same time."
+            )
+        if point_prompt is None and box_prompt is None:
+            raise ValueError(
+                "SAM must use either points or boxes as prompt, now both is None."
+            )
+
+        point_prompt = np.array(point_prompt).reshape(-1, 2) if point_prompt is not None else None
+        box_prompt = np.array(box_prompt).reshape(-1, 4) if box_prompt is not None else None
+
+        if point_prompt is not None and point_prompt.size > 2:
+            raise ValueError(
+                "SAM now only support one point for using point promot, your input format should be like [[x, y]] only."
+            )
+
+        image_seg = self.image_processor(images)
+        # Remember the sizes of this batch so postprocess can undo the resize.
+        self.original_size = self.image_processor.original_size
+        self.input_size = self.image_processor.input_size
+        prompt = self.prompt_processor(
+            self.original_size,
+            point_coords=point_prompt,
+            box=box_prompt,
+        )
+
+        return image_seg, prompt
+
+    def postprocess(self, low_res_masks, mask_threshold: float = 0.0):
+        """Upsample predicted masks to the original image size and binarize them.
+
+        Args:
+            low_res_masks: Predicted masks, or a one-element list holding them.
+            mask_threshold: Values strictly above this become 1, others 0.
+
+        Returns:
+            list: A one-element list holding the ``int8`` mask array.
+
+        Raises:
+            RuntimeError: If called before ``preprocess`` recorded the sizes.
+            ValueError: If ``low_res_masks`` is a list whose length is not 1.
+        """
+
+        if self.original_size is None or self.input_size is None:
+            raise RuntimeError(
+                "SAMProcessor.postprocess must be called after SAMProcessor.preprocess."
+            )
+
+        if isinstance(low_res_masks, list):
+            # Explicit check instead of ``assert`` so it survives ``python -O``.
+            if len(low_res_masks) != 1:
+                raise ValueError(
+                    f"Expected exactly one prediction output, got {len(low_res_masks)}."
+                )
+            low_res_masks = low_res_masks[0]
+
+        # Upsample to the padded model resolution first ...
+        masks = F.interpolate(
+            paddle.to_tensor(low_res_masks),
+            (self.size, self.size),
+            mode="bilinear",
+            align_corners=False,
+        )
+        # ... crop to the resized-image extent recorded by preprocess ...
+        masks = masks[..., : self.input_size[0], : self.input_size[1]]
+        # ... then resize to the original image size.
+        masks = F.interpolate(masks, self.original_size, mode="bilinear", align_corners=False)
+        masks = (masks > mask_threshold).numpy().astype(np.int8)
+
+        return [masks]
+
+
+class SamPromptProcessor(object):
+    """Rescales point and box prompts from original-image coordinates to the resized image."""
+
+    def __init__(
+        self,
+        size: int = 1024,
+    ):
+        # Target long-side length used to derive the resize factors.
+        self.size = size
+
+    def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
+        """Scale ``coords`` (x in index 0, y in index 1 of the last axis).
+
+        ``original_size`` is the original image size as (H, W). The input
+        array is left untouched; a float copy is returned.
+        """
+        src_h, src_w = original_size
+        dst_h, dst_w = _get_preprocess_shape(src_h, src_w, self.size)
+        scaled = deepcopy(coords).astype(float)
+        scaled[..., 0] *= dst_w / src_w
+        scaled[..., 1] *= dst_h / src_h
+        return scaled
+
+    def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
+        """Scale an Nx4 box array by treating each box as two (x, y) corners.
+
+        ``original_size`` is the original image size as (H, W).
+        """
+        corners = boxes.reshape([-1, 2, 2])
+        return self.apply_coords(corners, original_size).reshape([-1, 4])
+
+    def __call__(
+        self,
+        original_size,
+        point_coords=None,
+        box=None,
+        **kwargs,
+    ):
+        """Return the rescaled prompt as ``float32``.
+
+        Points gain a leading axis of length 1; boxes are returned as Nx4.
+        ``None`` is returned when neither prompt is supplied, and supplying
+        both raises ``ValueError``.
+        """
+        has_points = point_coords is not None
+        has_box = box is not None
+
+        if has_points and has_box:
+            raise ValueError(
+                "SAM can only use either points or boxes as prompt, not both at the same time."
+            )
+
+        if has_points:
+            scaled_points = self.apply_coords(point_coords, original_size)
+            return scaled_points[None, ...].astype(np.float32)
+
+        if has_box:
+            return self.apply_boxes(box, original_size).astype(np.float32)
+
+        return None
+
+class SamImageProcessor(object):
+    """Resizes, normalizes and pads images before they are fed to SAM."""
+
+    def __init__(
+        self,
+        size: Union[List[int], int] = None,
+        image_mean: Union[float, List[float]] = [0.5, 0.5, 0.5],
+        image_std: Union[float, List[float]] = [0.5, 0.5, 0.5],
+        **kwargs,
+    ) -> None:
+        """Store the target size (1024 when ``None``) and normalization values.
+
+        A float mean or std is repeated three times; extra keyword arguments
+        are accepted and ignored.
+        """
+        self.size = 1024 if size is None else size
+        self.image_mean = [image_mean] * 3 if isinstance(image_mean, float) else image_mean
+        self.image_std = [image_std] * 3 if isinstance(image_std, float) else image_std
+
+        # Both are set by ``preprocess`` for the most recent batch.
+        self.original_size = None
+        self.input_size = None
+
+    def apply_image(self, image: np.ndarray) -> np.ndarray:
+        """Resize one HxWxC image so its longer side matches ``self.size``.
+
+        A numpy input is converted to a PIL image before resizing; the result
+        is returned as a numpy array.
+        """
+        resized_hw = _get_preprocess_shape(image.shape[0], image.shape[1], self.size)
+        pil_image = PIL.Image.fromarray(image) if isinstance(image, np.ndarray) else image
+        return np.array(T.resize(pil_image, resized_hw))
+
+    def __call__(self, images, **kwargs):
+        """Run ``preprocess`` on a list/tuple of images or on a single image."""
+        batch = images if isinstance(images, (list, tuple)) else [images]
+        return self.preprocess(batch)
+
+    def preprocess(
+        self,
+        images,
+    ):
+        """Resize, normalize and pad a batch of same-shaped images.
+
+        Records ``original_size`` (from the first image) and ``input_size``
+        (the resized height/width) on the instance, and returns the padded
+        batch as a numpy array.
+        """
+        pad_target = self.size
+
+        resized = [self.apply_image(img) for img in images]
+        # Stack to a tensor and move channels in front of height/width.
+        batch = paddle.to_tensor(resized).cast("int32").transpose([0, 3, 1, 2])
+
+        self.original_size = images[0].shape[:2]
+        self.input_size = tuple(batch.shape[-2:])
+
+        mean = paddle.to_tensor(self.image_mean).reshape([-1, 1, 1])
+        std = paddle.to_tensor(self.image_std).reshape([-1, 1, 1])
+        normalized = (batch.astype(std.dtype) - mean) / std
+
+        height, width = normalized.shape[-2:]
+        # Pad amounts are given as (0, pad_w, 0, pad_h); presumably this pads
+        # the right and bottom edges up to the target size - confirm against
+        # paddle.nn.functional.pad.
+        padded = F.pad(normalized, (0, pad_target - width, 0, pad_target - height))
+
+        return padded.numpy()

+ 15 - 0
paddlex/inference/models_new/open_vocabulary_segmentation/results/__init__.py

@@ -0,0 +1,15 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .sam_result import SAMSegResult

+ 138 - 0
paddlex/inference/models_new/open_vocabulary_segmentation/results/sam_result.py

@@ -0,0 +1,138 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import cv2
+import numpy as np
+import copy, random
+import PIL
+from PIL import Image, ImageDraw, ImageFont
+
+from ....utils.color_map import get_colormap
+from ....common.result import BaseCVResult
+
+def draw_segm(im, masks, mask_info, alpha=0.7):
+    """Overlay segmentation masks and their prompts on an image.
+
+    Args:
+        im: Image-like object convertible with ``np.array``.
+        masks: Sequence of masks convertible to one array; entry ``i`` is
+            drawn with ``mask_info[i]``.
+        mask_info: Per-mask dicts with the keys ``"class_id"``, ``"label"``
+            (``"box_prompt"`` or ``"point_prompt"``) and ``"prompt"``
+            (``x0, y0, x1, y1`` for a box, ``x, y`` for a point).
+        alpha: Blend weight of the mask color over the masked pixels.
+
+    Returns:
+        PIL.Image.Image: The annotated image converted to ``uint8``.
+
+    Raises:
+        NotImplementedError: If a label is neither ``"box_prompt"`` nor
+            ``"point_prompt"``.
+    """
+    w_ratio = 0.4
+    color_list = get_colormap(rgb=True)
+    im = np.array(im).astype("float32")
+    clsid2color = {}
+    masks = np.array(masks).astype(np.uint8)
+    for i, mask in enumerate(masks):
+        info = mask_info[i]
+        clsid = info["class_id"]
+        label = info["label"]
+
+        if clsid not in clsid2color:
+            clsid2color[clsid] = color_list[i % len(color_list)]
+        # Lighten into a fresh array. Writing back into the stored color would
+        # lighten it again for every further mask sharing this class id and
+        # would also modify the colormap entry itself.
+        color_mask = (
+            np.array(clsid2color[clsid], dtype="float64") * (1 - w_ratio)
+            + w_ratio * 255
+        )
+        draw_color = tuple(color_mask.astype("int32").tolist())
+
+        idx = np.nonzero(mask)
+        # Clamp indices so a mask larger than the image cannot index past it.
+        idx0 = np.minimum(idx[0], im.shape[0] - 1)
+        idx1 = np.minimum(idx[1], im.shape[1] - 1)
+        im[idx0, idx1, :] *= 1.0 - alpha
+        im[idx0, idx1, :] += alpha * color_mask
+
+        if label == "box_prompt":
+            # draw box prompt
+            x0, y0, x1, y1 = (int(v) for v in info["prompt"])
+            cv2.rectangle(im, (x0, y0), (x1, y1), draw_color, 1)
+            bbox_text = "%s" % label
+            t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0]
+            # Filled background for the label, placed above the box corner.
+            cv2.rectangle(
+                im,
+                (x0, y0),
+                (x0 + t_size[0], y0 - t_size[1] - 3),
+                draw_color,
+                -1,
+            )
+            cv2.putText(
+                im,
+                bbox_text,
+                (x0, y0 - 2),
+                cv2.FONT_HERSHEY_SIMPLEX,
+                0.3,
+                (0, 0, 0),
+                1,
+                lineType=cv2.LINE_AA,
+            )
+        elif label == "point_prompt":
+            # draw point prompt; cast to int like the box branch, since the
+            # OpenCV drawing calls take integer pixel coordinates.
+            x, y = (int(v) for v in info["prompt"])
+            bbox_text = "%s" % label
+            t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0]
+            cv2.circle(
+                im,
+                (x, y),
+                1,
+                (255, 255, 255),
+                4,
+            )
+            cv2.putText(
+                im,
+                bbox_text,
+                (x - t_size[0] // 2, y - t_size[1] - 1),
+                cv2.FONT_HERSHEY_SIMPLEX,
+                0.3,
+                (255, 255, 255),
+                1,
+                lineType=cv2.LINE_AA,
+            )
+        else:
+            raise NotImplementedError(
+                f"Prompt type {label} not implemented."
+            )
+    return Image.fromarray(im.astype("uint8"))
+
+class SAMSegResult(BaseCVResult):
+    """Result container for SAM predictions with image and string rendering."""
+    def __init__(self, data: dict) -> None:
+        """Squeeze the masks, derive one ``mask_infos`` entry per prompt and store ``data``.
+
+        ``data["prompts"]`` must be a one-entry dict mapping the prompt type to
+        an iterable of prompts; each prompt gets a randomly drawn ``class_id``.
+        ``data`` is updated in place before being handed to the base class.
+        """
+        squeezed = []
+        for mask in list(data["masks"]):
+            squeezed.append(mask.squeeze(0))
+        data["masks"] = squeezed
+
+        prompt_dict = data['prompts']
+        assert isinstance(prompt_dict, dict) and len(prompt_dict) == 1
+        prompt_type, prompt_values = next(iter(prompt_dict.items()))
+
+        mask_infos = []
+        for p in prompt_values:
+            mask_infos.append(
+                {
+                    "label": prompt_type,
+                    "class_id": random.randint(0, len(get_colormap(rgb=True)) - 1),
+                    "prompt": p,
+                }
+            )
+        data["mask_infos"] = mask_infos
+        assert len(data["masks"]) == len(mask_infos)
+
+        super().__init__(data)
+
+    def _to_img(self):
+        """Render the input image with masks and prompts drawn by ``draw_segm``."""
+        base_image = Image.fromarray(self["input_img"])
+        infos = self["mask_infos"]
+        mask_list = self["masks"]
+        return draw_segm(base_image, mask_list, infos)
+
+    def _to_str(self, _, *args, **kwargs):
+        """Build the string form from a deep copy whose masks are replaced by ``"..."``."""
+        printable = copy.deepcopy(self)
+        printable["masks"] = "..."
+        return super()._to_str(printable, *args, **kwargs)

+ 1 - 0
paddlex/inference/utils/new_ir_blacklist.py

@@ -23,4 +23,5 @@ NEWIR_BLOCKLIST = [
     "Co-DINO-R50",
     "Co-DINO-Swin-L",
     "LaTeX_OCR_rec",
+    "GroundingDINO-T",
 ]

+ 3 - 0
paddlex/inference/utils/official_models.py

@@ -321,6 +321,9 @@ PP-LCNet_x1_0_vehicle_attribute_infer.tar",
     "YOWO": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0b2/YOWO_infer.tar",
     "PP-TinyPose_128x96": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0rc0/PP-TinyPose_128x96_infer.tar",
     "PP-TinyPose_256x192": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0rc0/PP-TinyPose_256x192_infer.tar",
+    "GroundingDINO-T": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0rc0/GroundingDINO-T_infer.tar",
+    "SAM-H_box": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0rc0/SAM-H_box_infer.tar",
+    "SAM-H_point": "https://paddle-model-ecology.bj.bcebos.com/paddlex/official_inference_model/paddle3.0rc0/SAM-H_point_infer.tar",
 }
 
 

+ 13 - 0
paddlex/modules/open_vocabulary_detection/__init__.py

@@ -0,0 +1,13 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+ 18 - 0
paddlex/modules/open_vocabulary_detection/model_list.py

@@ -0,0 +1,18 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Model names listed for the open_vocabulary_detection module.
+MODELS = [
+    "GroundingDINO-T",
+]

+ 13 - 0
paddlex/modules/open_vocabulary_segmentation/__init__.py

@@ -0,0 +1,13 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

+ 19 - 0
paddlex/modules/open_vocabulary_segmentation/model_list.py

@@ -0,0 +1,19 @@
+# copyright (c) 2024 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Model names listed for the open_vocabulary_segmentation module.
+MODELS = [
+    "SAM-H_point",
+    "SAM-H_box",
+]