qianfan_bot_retriever.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. from operator import le
  15. from typing import Dict, List
  16. import json
  17. import requests
  18. from langchain_core.embeddings import Embeddings
  19. from paddlex.utils import logging
  20. from .base import BaseRetriever
  21. class QianFanBotRetriever(BaseRetriever):
  22. """QianFan Bot Retriever"""
  23. entities = [
  24. "qianfan",
  25. ]
  26. MODELS = [
  27. "tao-8k",
  28. "embedding-v1",
  29. "bge-large-zh",
  30. "bge-large-en",
  31. ]
  32. def __init__(self, config: Dict) -> None:
  33. """
  34. Initializes the ErnieBotRetriever instance with the provided configuration.
  35. Args:
  36. config (Dict): A dictionary containing configuration settings.
  37. - model_name (str): The name of the model to use.
  38. - api_type (str): The type of API to use ('qianfan' or 'openai').
  39. - api_key (str): The API key for 'qianfan' API.
  40. - base_url (str): The base URL for 'qianfan' API.
  41. Raises:
  42. ValueError: If api_type is not one of ['qianfan','openai'],
  43. base_url is None for api_type is qianfan,
  44. api_key is None for api_type is qianfan.
  45. """
  46. super().__init__()
  47. model_name = config.get("model_name", None)
  48. api_key = config.get("api_key", None)
  49. base_url = config.get("base_url", None)
  50. if model_name not in self.MODELS:
  51. raise ValueError(
  52. f"model_name must be in {self.MODELS} of QianFanBotRetriever."
  53. )
  54. if api_key is None:
  55. raise ValueError("api_key cannot be empty when api_type is qianfan.")
  56. if base_url is None:
  57. raise ValueError("base_url cannot be empty when api_type is qianfan.")
  58. self.embedding = QianfanEmbeddings(
  59. model=model_name,
  60. base_url=base_url,
  61. api_key=api_key,
  62. )
  63. self.model_name = model_name
  64. self.config = config
  65. class QianfanEmbeddings(Embeddings):
  66. """`Baidu Qianfan Embeddings` embedding models."""
  67. def __init__(
  68. self,
  69. api_key: str,
  70. base_url: str = "https://qianfan.baidubce.com/v2",
  71. model: str = "embedding-v1",
  72. **kwargs,
  73. ):
  74. """
  75. Initialize the Baidu Qianfan Embeddings class.
  76. Args:
  77. api_key (str): The Qianfan API key.
  78. base_url (str): The base URL for 'qianfan' API.
  79. model (str): Model name. Default is "embedding-v1",select in ["tao-8k","embedding-v1","bge-large-en","bge-large-zh"].
  80. kwargs (dict): Additional keyword arguments passed to the base Embeddings class.
  81. """
  82. super().__init__(**kwargs)
  83. chunk_size_map = {
  84. "tao-8k": 1,
  85. "embedding-v1": 16,
  86. "bge-large-en": 16,
  87. "bge-large-zh": 16,
  88. }
  89. self.api_key = api_key
  90. self.base_url = base_url
  91. self.model = model
  92. self.chunk_size = chunk_size_map.get(model, 1)
  93. def embed(self, texts: str, **kwargs) -> List[float]:
  94. url = f"{self.base_url}/embeddings"
  95. payload = json.dumps(
  96. {"model": kwargs.get("model", self.model), "input": [f"{texts}"]}
  97. )
  98. headers = {
  99. "Content-Type": "application/json",
  100. "Authorization": f"Bearer {self.api_key}",
  101. }
  102. response = requests.request("POST", url, headers=headers, data=payload)
  103. if response.status_code != 200:
  104. logging.error(
  105. f"Failed to call Qianfan API. Status code: {response.status_code}, Response content: {response}"
  106. )
  107. return response.json()
  108. def embed_query(self, text: str) -> List[float]:
  109. resp = self.embed_documents([text])
  110. return resp[0]
  111. def embed_documents(self, texts: List[str]) -> List[List[float]]:
  112. """
  113. Embeds a list of text documents using the AutoVOT algorithm.
  114. Args:
  115. texts (List[str]): A list of text documents to embed.
  116. Returns:
  117. List[List[float]]: A list of embeddings for each document in the input list.
  118. Each embedding is represented as a list of float values.
  119. """
  120. lst = []
  121. for chunk in texts:
  122. resp = self.embed(texts=chunk)
  123. lst.extend([res["embedding"] for res in resp["data"]])
  124. return lst
  125. async def aembed_query(self, text: str) -> List[float]:
  126. embeddings = await self.aembed_documents([text])
  127. return embeddings[0]
  128. async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
  129. lst = []
  130. for chunk in texts:
  131. resp = await self.embed(texts=chunk)
  132. for res in resp["data"]:
  133. lst.extend([res["embedding"]])
  134. return lst