// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include "fast_tokenizer/tokenizers/ernie_fast_tokenizer.h" #include "ultra_infer/ultra_infer_model.h" #include "ultra_infer/utils/unique_ptr.h" #include #include #include #include #include using namespace paddlenlp; namespace ultra_infer { namespace text { struct ULTRAINFER_DECL UIEResult { size_t start_; size_t end_; double probability_; std::string text_; std::unordered_map> relation_; UIEResult() = default; UIEResult(size_t start, size_t end, double probability, std::string text) : start_(start), end_(end), probability_(probability), text_(text) {} std::string Str() const; }; ULTRAINFER_DECL std::ostream &operator<<(std::ostream &os, const UIEResult &result); ULTRAINFER_DECL std::ostream &operator<<( std::ostream &os, const std::vector>> &results); struct ULTRAINFER_DECL SchemaNode { std::string name_; std::vector> prefix_; std::vector> relations_; std::vector children_; SchemaNode() = default; SchemaNode(const SchemaNode &) = default; explicit SchemaNode(const std::string &name, const std::vector &children = {}) : name_(name), children_(children) {} void AddChild(const std::string &schema) { children_.emplace_back(schema); } void AddChild(const SchemaNode &schema) { children_.push_back(schema); } void AddChild(const std::string &schema, const std::vector &children) { SchemaNode schema_node(schema); for (auto &child : children) { schema_node.children_.emplace_back(child); } children_.emplace_back(schema_node); } void AddChild(const std::string &schema, const std::vector &children) { SchemaNode schema_node(schema); schema_node.children_ = children; children_.emplace_back(schema_node); } }; enum SchemaLanguage { ZH, // Chinese EN // English }; struct Schema { explicit Schema(const std::string &schema, const std::string &name = "root"); explicit Schema(const std::vector &schema_list, const std::string &name = "root"); explicit Schema(const std::vector &schema_list, const std::string &name = "root"); explicit Schema(const SchemaNode &schema, const std::string &name = "root"); private: void CreateRoot(const std::string &name); std::unique_ptr root_; friend class UIEModel; }; struct ULTRAINFER_DECL UIEModel : public UltraInferModel { public: UIEModel(const std::string &model_file, const std::string ¶ms_file, const std::string &vocab_file, float position_prob, size_t max_length, const std::vector &schema, int batch_size, const ultra_infer::RuntimeOption &custom_option = ultra_infer::RuntimeOption(), const ultra_infer::ModelFormat &model_format = ultra_infer::ModelFormat::PADDLE, SchemaLanguage schema_language = SchemaLanguage::ZH); UIEModel(const std::string &model_file, const std::string ¶ms_file, const std::string &vocab_file, float position_prob, size_t max_length, const SchemaNode &schema, int batch_size, const ultra_infer::RuntimeOption &custom_option = ultra_infer::RuntimeOption(), const ultra_infer::ModelFormat &model_format = ultra_infer::ModelFormat::PADDLE, SchemaLanguage schema_language = SchemaLanguage::ZH); UIEModel(const std::string &model_file, const std::string ¶ms_file, const std::string &vocab_file, float position_prob, size_t max_length, const std::vector &schema, int batch_size, const ultra_infer::RuntimeOption &custom_option = ultra_infer::RuntimeOption(), const ultra_infer::ModelFormat &model_format = ultra_infer::ModelFormat::PADDLE, SchemaLanguage schema_language = SchemaLanguage::ZH); virtual std::string ModelName() const { return "UIEModel"; } void SetSchema(const std::vector &schema); void SetSchema(const std::vector &schema); void SetSchema(const SchemaNode &schema); bool ConstructTextsAndPrompts( const std::vector &raw_texts, const std::string &node_name, const std::vector> node_prefix, std::vector *input_texts, std::vector *prompts, std::vector> *input_mapping_with_raw_texts, std::vector> *input_mapping_with_short_text); void Preprocess(const std::vector &input_texts, const std::vector &prompts, std::vector *encodings, std::vector *inputs); void Postprocess( const std::vector &outputs, const std::vector &encodings, const std::vector &short_input_texts, const std::vector &short_prompts, const std::vector> &input_mapping_with_short_text, std::vector> *results); void ConstructChildPromptPrefix( const std::vector> &input_mapping_with_raw_texts, const std::vector> &results_list, std::vector> *prefix); void ConstructChildRelations( const std::vector> &old_relations, const std::vector> &input_mapping_with_raw_texts, const std::vector> &results_list, const std::string &node_name, std::vector>> *results, std::vector> *new_relations); void Predict(const std::vector &texts, std::vector>> *results); protected: using IDX_PROB = std::pair; struct IdxProbCmp { bool operator()(const std::pair &lhs, const std::pair &rhs) const; }; using SPAN_SET = std::set, IdxProbCmp>; struct SpanIdx { fast_tokenizer::core::Offset offset_; bool is_prompt_; }; void SetValidBackend(); bool Initialize(); void AutoSplitter(const std::vector &texts, size_t max_length, std::vector *short_texts, std::vector> *input_mapping); void AutoJoiner(const std::vector &short_texts, const std::vector> &input_mapping, std::vector> *results); // Get idx of the last dimension in probability arrays, which is greater than // a limitation. void GetCandidateIdx(const float *probs, int64_t batch_size, int64_t seq_len, std::vector> *candidate_idx_prob, float threshold = 0.5) const; void GetSpan(const std::vector &start_idx_prob, const std::vector &end_idx_prob, SPAN_SET *span_set) const; void GetSpanIdxAndProbs( const SPAN_SET &span_set, const std::vector &offset_mapping, std::vector *span_idxs, std::vector *probs) const; void ConvertSpanToUIEResult(const std::vector &texts, const std::vector &prompts, const std::vector> &span_idxs, const std::vector> &probs, std::vector> *results) const; std::unique_ptr schema_; size_t max_length_; float position_prob_; int batch_size_; SchemaLanguage schema_language_; fast_tokenizer::tokenizers_impl::ErnieFastTokenizer tokenizer_; }; } // namespace text } // namespace ultra_infer