- """Make approximate assertions as "expectations" on test results.
- This module is designed to be used within test cases decorated with the
- `@pytest.mark.decorator` decorator
- It allows you to log scores about a test case and optionally make assertions that log as
- "expectation" feedback to LangSmith.
- Example:
- ```python
- import pytest
- from langsmith import expect
- @pytest.mark.langsmith
- def test_output_semantically_close():
- response = oai_client.chat.completions.create(
- model="gpt-3.5-turbo",
- messages=[
- {"role": "system", "content": "You are a helpful assistant."},
- {"role": "user", "content": "Say hello!"},
- ],
- )
- response_txt = response.choices[0].message.content
- # Intended usage
- expect.embedding_distance(
- prediction=response_txt,
- reference="Hello!",
- ).to_be_less_than(0.9)
- # Score the test case
- matcher = expect.edit_distance(
- prediction=response_txt,
- reference="Hello!",
- )
- # Apply an assertion and log 'expectation' feedback to LangSmith
- matcher.to_be_less_than(1)
- # You can also directly make assertions on values directly
- expect.value(response_txt).to_contain("Hello!")
- # Or using a custom check
- expect.value(response_txt).against(lambda x: "Hello" in x)
- # You can even use this for basic metric logging within tests
- expect.score(0.8)
- expect.score(0.7, key="similarity").to_be_greater_than(0.7)
- ```
- """ # noqa: E501

from __future__ import annotations

import atexit
import inspect
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Literal,
    Optional,
    Union,
    overload,
)

from langsmith import client as ls_client
from langsmith import run_helpers as rh
from langsmith import run_trees as rt
from langsmith import utils as ls_utils

if TYPE_CHECKING:
    from langsmith._internal._edit_distance import EditDistanceConfig
    from langsmith._internal._embedding_distance import EmbeddingConfig


# Sentinel class used until PEP 0661 is accepted
class _NULL_SENTRY:
    """A sentinel singleton class used to distinguish omitted keyword arguments
    from those passed in with the value None (which may have different behavior).
    """  # noqa: D205

    def __bool__(self) -> Literal[False]:
        return False

    def __repr__(self) -> str:
        return "NOT_GIVEN"


NOT_GIVEN = _NULL_SENTRY()


class _Matcher:
    """A class for making assertions on expectation values."""

    def __init__(
        self,
        client: Optional[ls_client.Client],
        key: str,
        value: Any,
        _executor: Optional[ls_utils.ContextThreadPoolExecutor] = None,
        run_id: Optional[str] = None,
    ):
        self._client = client
        self.key = key
        self.value = value
        self._executor = _executor or ls_utils.ContextThreadPoolExecutor(max_workers=3)
        # Prefer the trace id of the currently active run tree, if any.
        rt = rh.get_current_run_tree()
        self._run_id = rt.trace_id if rt else run_id

    def _submit_feedback(self, score: int, message: Optional[str] = None) -> None:
        if not ls_utils.test_tracking_is_disabled():
            if not self._client:
                self._client = rt.get_cached_client()
            self._executor.submit(
                self._client.create_feedback,
                run_id=self._run_id,
                key="expectation",
                score=score,
                comment=message,
            )

    def _assert(self, condition: bool, message: str, method_name: str) -> None:
        try:
            assert condition, message
            self._submit_feedback(1, message=f"Success: {self.key}.{method_name}")
        except AssertionError as e:
            self._submit_feedback(0, repr(e))
            raise e from None

    def to_be_less_than(self, value: float) -> None:
        """Assert that the expectation value is less than the given value.

        Args:
            value: The value to compare against.

        Raises:
            AssertionError: If the expectation value is not less than the given value.
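
        Example:
            ```python
            # Passes because 0.1 < 0.5.
            expect.value(0.1).to_be_less_than(0.5)
            ```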
- """
- self._assert(
- self.value < value,
- f"Expected {self.key} to be less than {value}, but got {self.value}",
- "to_be_less_than",
- )

    def to_be_greater_than(self, value: float) -> None:
        """Assert that the expectation value is greater than the given value.

        Args:
            value: The value to compare against.

        Raises:
            AssertionError: If the expectation value is not
                greater than the given value.
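
        Example:
            ```python
            # Passes because 0.9 > 0.5.
            expect.value(0.9).to_be_greater_than(0.5)
            ```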
- """
- self._assert(
- self.value > value,
- f"Expected {self.key} to be greater than {value}, but got {self.value}",
- "to_be_greater_than",
- )

    def to_be_between(self, min_value: float, max_value: float) -> None:
        """Assert that the expectation value is between the given min and max values.

        Args:
            min_value: The minimum value (exclusive).
            max_value: The maximum value (exclusive).

        Raises:
            AssertionError: If the expectation value is not between the min and max.
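
        Example:
            ```python
            # Passes because 0.0 < 0.5 < 1.0 (both bounds are exclusive).
            expect.value(0.5).to_be_between(0.0, 1.0)
            ```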
- """
- self._assert(
- min_value < self.value < max_value,
- f"Expected {self.key} to be between {min_value} and {max_value},"
- f" but got {self.value}",
- "to_be_between",
- )

    def to_be_approximately(self, value: float, precision: int = 2) -> None:
        """Assert that the expectation value is approximately equal to the given value.

        Args:
            value: The value to compare against.
            precision: The number of decimal places to round to for comparison.

        Raises:
            AssertionError: If the rounded expectation value
                does not equal the rounded given value.
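
        Example:
            ```python
            # Passes because round(0.123, 2) == round(0.12, 2).
            expect.value(0.123).to_be_approximately(0.12, precision=2)
            ```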
- """
- self._assert(
- round(self.value, precision) == round(value, precision),
- f"Expected {self.key} to be approximately {value}, but got {self.value}",
- "to_be_approximately",
- )

    def to_equal(self, value: float) -> None:
        """Assert that the expectation value equals the given value.

        Args:
            value: The value to compare against.

        Raises:
            AssertionError: If the expectation value does
                not exactly equal the given value.
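
        Example:
            ```python
            # Passes because the values are exactly equal.
            expect.value(10).to_equal(10)
            ```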
- """
- self._assert(
- self.value == value,
- f"Expected {self.key} to be equal to {value}, but got {self.value}",
- "to_equal",
- )

    def to_be_none(self) -> None:
        """Assert that the expectation value is `None`.

        Raises:
            AssertionError: If the expectation value is not `None`.
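
        Example:
            ```python
            # Passes because the wrapped value is None.
            expect.value(None).to_be_none()
            ```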
- """
- self._assert(
- self.value is None,
- f"Expected {self.key} to be None, but got {self.value}",
- "to_be_none",
- )

    def to_contain(self, value: Any) -> None:
        """Assert that the expectation value contains the given value.

        Args:
            value: The value to check for containment.

        Raises:
            AssertionError: If the expectation value does not contain the given value.
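
        Example:
            ```python
            # Passes because "Hello" is a substring of the wrapped value.
            expect.value("Hello, world!").to_contain("Hello")
            ```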
- """
- self._assert(
- value in self.value,
- f"Expected {self.key} to contain {value}, but it does not",
- "to_contain",
- )

    # Custom assertions
    def against(self, func: Callable, /) -> None:
        """Assert the expectation value against a custom function.

        Args:
            func: A custom function that takes the expectation value as input
                and returns a truthy value if the assertion should pass.

        Raises:
            AssertionError: If the custom function returns False.
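
        Example:
            ```python
            # Passes because the custom predicate returns True for the value.
            expect.value("Hello, world!").against(lambda x: x.startswith("Hello"))
            ```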
- """
- func_signature = inspect.signature(func)
- self._assert(
- func(self.value),
- f"Assertion {func_signature} failed for {self.key}",
- "against",
- )


class _Expect:
    """A class for setting expectations on test results."""

    def __init__(self, *, client: Optional[ls_client.Client] = None):
        self._client = client
        self.executor = ls_utils.ContextThreadPoolExecutor(max_workers=3)
        atexit.register(self.executor.shutdown, wait=True)

    def embedding_distance(
        self,
        prediction: str,
        reference: str,
        *,
        config: Optional[EmbeddingConfig] = None,
    ) -> _Matcher:
        """Compute the embedding distance between the prediction and reference.

        This logs the embedding distance to LangSmith and returns a `_Matcher` instance
        for making assertions on the distance value.

        By default, this uses the OpenAI API for computing embeddings.

        Args:
            prediction: The predicted string to compare.
            reference: The reference string to compare against.
            config: Optional configuration for the embedding distance evaluator.
                Supported options:
                - `encoder`: A custom encoder function to encode the list of input
                    strings to embeddings. Defaults to the OpenAI API.
                - `metric`: The distance metric to use for comparison.
                    Supported values: `'cosine'`, `'euclidean'`, `'manhattan'`,
                    `'chebyshev'`, `'hamming'`.

        Returns:
            A `_Matcher` instance for the embedding distance value.

        Example:
            ```python
            expect.embedding_distance(
                prediction="hello",
                reference="hi",
            ).to_be_less_than(1.0)
            ```
        """  # noqa: E501
        from langsmith._internal._embedding_distance import EmbeddingDistance

        config = config or {}
        encoder_func = "custom" if config.get("encoder") else "openai"
        evaluator = EmbeddingDistance(config=config)
        score = evaluator.evaluate(prediction=prediction, reference=reference)
        src_info = {"encoder": encoder_func, "metric": evaluator.distance}
        self._submit_feedback(
            "embedding_distance",
            {
                "score": score,
                "source_info": src_info,
                "comment": f"Using {encoder_func}, Metric: {evaluator.distance}",
            },
        )
        return _Matcher(
            self._client, "embedding_distance", score, _executor=self.executor
        )

    def edit_distance(
        self,
        prediction: str,
        reference: str,
        *,
        config: Optional[EditDistanceConfig] = None,
    ) -> _Matcher:
        """Compute the string distance between the prediction and reference.

        This logs the string distance (Damerau-Levenshtein) to LangSmith and returns
        a `_Matcher` instance for making assertions on the distance value.

        This depends on the `rapidfuzz` package for string distance computation.

        Args:
            prediction: The predicted string to compare.
            reference: The reference string to compare against.
            config: Optional configuration for the string distance evaluator.
                Supported options:
                - `metric`: The distance metric to use for comparison.
                    Supported values: `'damerau_levenshtein'`, `'levenshtein'`,
                    `'jaro'`, `'jaro_winkler'`, `'hamming'`, `'indel'`.
                - `normalize_score`: Whether to normalize the score between `0` and `1`.

        Returns:
            A `_Matcher` instance for the string distance value.

        Examples:
            ```python
            expect.edit_distance("hello", "helo").to_be_less_than(1)
            ```
        """
        from langsmith._internal._edit_distance import EditDistance

        config = config or {}
        metric = config.get("metric") or "damerau_levenshtein"
        normalize = config.get("normalize_score", True)
        evaluator = EditDistance(config=config)
        score = evaluator.evaluate(prediction=prediction, reference=reference)
        src_info = {"metric": metric, "normalize": normalize}
        self._submit_feedback(
            "edit_distance",
            {
                "score": score,
                "source_info": src_info,
                "comment": f"Using {metric}, Normalize: {normalize}",
            },
        )
        return _Matcher(
            self._client,
            "edit_distance",
            score,
            _executor=self.executor,
        )

    def value(self, value: Any) -> _Matcher:
        """Create a `_Matcher` instance for making assertions on the given value.

        Args:
            value: The value to make assertions on.

        Returns:
            A `_Matcher` instance for the given value.

        Example:
            ```python
            expect.value(10).to_be_less_than(20)
            ```
        """
        return _Matcher(self._client, "value", value, _executor=self.executor)

    def score(
        self,
        score: Union[float, int, bool],
        *,
        key: str = "score",
        source_run_id: Optional[ls_client.ID_TYPE] = None,
        comment: Optional[str] = None,
    ) -> _Matcher:
        """Log a numeric score to LangSmith.

        Args:
            score: The score value to log.
            key: The key to use for logging the score. Defaults to `'score'`.
            source_run_id: Optional ID of the run that generated the score.
            comment: Optional comment to log alongside the score.

        Returns:
            A `_Matcher` instance for the score value.

        Example:
            ```python
            expect.score(0.8)  # doctest: +ELLIPSIS
            <langsmith._expect._Matcher object at ...>

            expect.score(0.8, key="similarity").to_be_greater_than(0.7)
            ```
        """
        self._submit_feedback(
            key,
            {
                "score": score,
                "source_info": {"method": "expect.score"},
                "source_run_id": source_run_id,
                "comment": comment,
            },
        )
        return _Matcher(self._client, key, score, _executor=self.executor)

    ## Private Methods

    @overload
    def __call__(self, value: Any, /) -> _Matcher: ...

    @overload
    def __call__(self, /, *, client: ls_client.Client) -> _Expect: ...

    def __call__(
        self,
        value: Optional[Any] = NOT_GIVEN,
        /,
        client: Optional[ls_client.Client] = None,
    ) -> Union[_Expect, _Matcher]:
        expected = _Expect(client=client)
        if value is not NOT_GIVEN:
            return expected.value(value)
        return expected

    def _submit_feedback(self, key: str, results: dict):
        current_run = rh.get_current_run_tree()
        run_id = current_run.trace_id if current_run else None
        if not ls_utils.test_tracking_is_disabled():
            if not self._client:
                self._client = rt.get_cached_client()
            self.executor.submit(
                self._client.create_feedback, run_id=run_id, key=key, **results
            )


expect = _Expect()

__all__ = ["expect"]