# evaluator.py
  1. """This module contains the evaluator classes for evaluating runs."""
  2. from __future__ import annotations
  3. import asyncio
  4. import inspect
  5. import uuid
  6. from abc import abstractmethod
  7. from collections.abc import Awaitable, Sequence
  8. from typing import (
  9. Any,
  10. Callable,
  11. Literal,
  12. Optional,
  13. Union,
  14. cast,
  15. )
  16. from typing_extensions import TypedDict
  17. from langsmith import run_helpers as rh
  18. from langsmith import schemas
  19. try:
  20. from pydantic.v1 import ( # type: ignore[import]
  21. BaseModel,
  22. Field,
  23. ValidationError,
  24. validator,
  25. )
  26. except ImportError:
  27. from pydantic import ( # type: ignore[assignment]
  28. BaseModel,
  29. Field,
  30. ValidationError,
  31. validator,
  32. )
  33. import logging
  34. from functools import wraps
  35. from langsmith.schemas import SCORE_TYPE, VALUE_TYPE, Example, Run
  36. logger = logging.getLogger(__name__)


class Category(TypedDict):
    """A category for categorical feedback."""

    value: Optional[Union[float, int]]
    """The numeric score/ordinal corresponding to this category."""
    label: str
    """The label for this category."""


class FeedbackConfig(TypedDict, total=False):
    """Configuration to define a type of feedback.

    Applied on the first creation of a `feedback_key`.
    """

    type: Literal["continuous", "categorical", "freeform"]
    """The type of feedback."""
    min: Optional[Union[float, int]]
    """The minimum permitted value (if continuous type)."""
    max: Optional[Union[float, int]]
    """The maximum permitted value (if continuous type)."""
    categories: Optional[list[Union[Category, dict]]]
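
# Illustrative sketch (not part of this module): a categorical feedback
# configuration built from the types above. The "sentiment" feedback key and
# its categories are hypothetical.
#
#     sentiment_config: FeedbackConfig = {
#         "type": "categorical",
#         "categories": [
#             Category(value=1, label="positive"),
#             Category(value=0, label="neutral"),
#             Category(value=-1, label="negative"),
#         ],
#     }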


class EvaluationResult(BaseModel):
    """Evaluation result."""

    key: str
    """The aspect, metric name, or label for this evaluation."""
    score: SCORE_TYPE = None
    """The numeric score for this evaluation."""
    value: VALUE_TYPE = None
    """The value for this evaluation, if not numeric."""
    comment: Optional[str] = None
    """An explanation regarding the evaluation."""
    correction: Optional[dict] = None
    """What the correct value should be, if applicable."""
    evaluator_info: dict = Field(default_factory=dict)
    """Additional information about the evaluator."""
    feedback_config: Optional[Union[FeedbackConfig, dict]] = None
    """The configuration used to generate this feedback."""
    source_run_id: Optional[Union[uuid.UUID, str]] = None
    """The ID of the trace of the evaluator itself."""
    target_run_id: Optional[Union[uuid.UUID, str]] = None
    """The ID of the trace this evaluation is applied to.

    If none provided, the evaluation feedback is applied to the
    root trace being evaluated."""
    extra: Optional[dict] = None
    """Metadata for the evaluator run."""

    class Config:
        """Pydantic model configuration."""

        allow_extra = False

    @validator("value", pre=True)
    def check_value_non_numeric(cls, v, values):
        """Check that the value is not numeric."""
        # If a score isn't provided and the value is numeric,
        # it's more likely the user intended to use the score field.
        if "score" not in values or values["score"] is None:
            if isinstance(v, (int, float)):
                logger.warning(
                    "Numeric values should be provided in"
                    " the 'score' field, not 'value'."
                    f" Got: {v}"
                )
        return v


class EvaluationResults(TypedDict, total=False):
    """Batch evaluation results.

    This makes it easy for your evaluator to return multiple
    metrics at once.
    """

    results: list[EvaluationResult]
    """The evaluation results."""


class RunEvaluator:
    """Evaluator interface class."""

    @abstractmethod
    def evaluate_run(
        self,
        run: Run,
        example: Optional[Example] = None,
        evaluator_run_id: Optional[uuid.UUID] = None,
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Evaluate an example."""

    async def aevaluate_run(
        self,
        run: Run,
        example: Optional[Example] = None,
        evaluator_run_id: Optional[uuid.UUID] = None,
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Evaluate an example asynchronously."""
        current_context = rh.get_tracing_context()

        def _run_with_context():
            with rh.tracing_context(**current_context):
                return self.evaluate_run(run, example, evaluator_run_id)

        return await asyncio.get_running_loop().run_in_executor(
            None, _run_with_context
        )
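
# Illustrative sketch (not part of this module): a minimal custom RunEvaluator
# subclass. The "has_output" metric name is hypothetical.
#
#     class HasOutputEvaluator(RunEvaluator):
#         def evaluate_run(self, run, example=None, evaluator_run_id=None):
#             return EvaluationResult(
#                 key="has_output",
#                 score=float(bool(run.outputs)),
#             )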


_RUNNABLE_OUTPUT = Union[EvaluationResult, EvaluationResults, dict]


class ComparisonEvaluationResult(BaseModel):
    """Feedback scores for the results of comparative evaluations.

    These are generated by functions that compare two or more runs,
    returning a ranking or other feedback.
    """

    key: str
    """The aspect, metric name, or label for this evaluation."""
    scores: dict[Union[uuid.UUID, str], SCORE_TYPE]
    """The scores for each run in the comparison."""
    source_run_id: Optional[Union[uuid.UUID, str]] = None
    """The ID of the trace of the evaluator itself."""
    comment: Optional[Union[str, dict[Union[uuid.UUID, str], str]]] = None
    """Comment for the scores. If a string, it's shared across all target runs.
    If a `dict`, it maps run IDs to individual comments.
    """


_COMPARISON_OUTPUT = Union[ComparisonEvaluationResult, dict]
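
# Illustrative sketch (not part of this module): a pairwise preference result
# mapping each compared run's ID to a score. The run IDs and the "preference"
# key are hypothetical.
#
#     preference = ComparisonEvaluationResult(
#         key="preference",
#         scores={run_a.id: 1, run_b.id: 0},
#         comment="Run A answered the question more directly.",
#     )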


class DynamicRunEvaluator(RunEvaluator):
    """A dynamic evaluator that wraps a function and transforms it into a `RunEvaluator`.

    This class is designed to be used with the `@run_evaluator` decorator, allowing
    functions that take a `Run` and an optional `Example` as arguments, and return
    an `EvaluationResult` or `EvaluationResults`, to be used as instances of `RunEvaluator`.

    Attributes:
        func (Callable): The function that is wrapped by this evaluator.
    """  # noqa: E501

    def __init__(
        self,
        func: Callable[
            [Run, Optional[Example]],
            Union[_RUNNABLE_OUTPUT, Awaitable[_RUNNABLE_OUTPUT]],
        ],
        # Async function to be used for async evaluation. Optional.
        afunc: Optional[
            Callable[
                [Run, Optional[Example]],
                Awaitable[_RUNNABLE_OUTPUT],
            ]
        ] = None,
    ):
        """Initialize the `DynamicRunEvaluator` with a given function.

        Args:
            func (Callable): A function that takes a `Run` and an optional `Example` as
                arguments, and returns an `EvaluationResult`, `EvaluationResults`,
                or dict.
        """
        func, prepare_inputs = _normalize_evaluator_func(func)
        if afunc:
            afunc, prepare_inputs = _normalize_evaluator_func(afunc)  # type: ignore[assignment]

        def process_inputs(inputs: dict) -> dict:
            if prepare_inputs is None:
                return inputs
            _, _, traced_inputs = prepare_inputs(
                inputs.get("run"), inputs.get("example")
            )
            return traced_inputs

        wraps(func)(self)
        from langsmith import run_helpers  # type: ignore

        if afunc is not None:
            self.afunc = run_helpers.ensure_traceable(
                afunc, process_inputs=process_inputs
            )
            self._name = getattr(afunc, "__name__", "DynamicRunEvaluator")
        if inspect.iscoroutinefunction(func):
            if afunc is not None:
                raise TypeError(
                    "Func was provided as a coroutine function, but afunc was "
                    "also provided. If providing both, func should be a regular "
                    "function to avoid ambiguity."
                )
            self.afunc = run_helpers.ensure_traceable(
                func, process_inputs=process_inputs
            )
            self._name = getattr(func, "__name__", "DynamicRunEvaluator")
        else:
            self.func = run_helpers.ensure_traceable(
                cast(Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT], func),
                process_inputs=process_inputs,
            )
            self._name = getattr(func, "__name__", "DynamicRunEvaluator")

    def _coerce_evaluation_result(
        self,
        result: Union[EvaluationResult, dict],
        source_run_id: uuid.UUID,
        allow_no_key: bool = False,
    ) -> EvaluationResult:
        if isinstance(result, EvaluationResult):
            if not result.source_run_id:
                result.source_run_id = source_run_id
            return result
        try:
            if not result:
                raise ValueError(
                    "Expected an EvaluationResult object, or dict with a metric"
                    f" 'key' and optional 'score'; got empty result: {result}"
                )
            if "key" not in result and allow_no_key:
                result["key"] = self._name
            if all(k not in result for k in ("score", "value", "comment")):
                raise ValueError(
                    "Expected an EvaluationResult object, or dict with a metric"
                    f" 'key' and optional 'score' or categorical 'value'; got {result}"
                )
            return EvaluationResult(**{"source_run_id": source_run_id, **result})
        except ValidationError as e:
            raise ValueError(
                "Expected an EvaluationResult object, or dict with a metric"
                f" 'key' and optional 'score'; got {result}"
            ) from e

    def _coerce_evaluation_results(
        self,
        results: Union[dict, EvaluationResults],
        source_run_id: uuid.UUID,
    ) -> Union[EvaluationResult, EvaluationResults]:
        if "results" in results:
            cp = results.copy()
            cp["results"] = [
                self._coerce_evaluation_result(r, source_run_id=source_run_id)
                for r in results["results"]
            ]
            return EvaluationResults(**cp)
        return self._coerce_evaluation_result(
            cast(dict, results), source_run_id=source_run_id, allow_no_key=True
        )

    def _format_result(
        self,
        result: Union[
            EvaluationResult, EvaluationResults, dict, str, int, bool, float, list
        ],
        source_run_id: uuid.UUID,
    ) -> Union[EvaluationResult, EvaluationResults]:
        if isinstance(result, EvaluationResult):
            if not result.source_run_id:
                result.source_run_id = source_run_id
            return result
        result = _format_evaluator_result(result)
        return self._coerce_evaluation_results(result, source_run_id)

    @property
    def is_async(self) -> bool:
        """Check if the evaluator function is asynchronous.

        Returns:
            bool: `True` if the evaluator function is asynchronous, `False` otherwise.
        """
        return hasattr(self, "afunc")

    def evaluate_run(
        self,
        run: Run,
        example: Optional[Example] = None,
        evaluator_run_id: Optional[uuid.UUID] = None,
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Evaluate a run using the wrapped function.

        This method directly invokes the wrapped function with the provided arguments.

        Args:
            run (Run): The run to be evaluated.
            example (Optional[Example]): An optional example to be used in the evaluation.

        Returns:
            Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
        """  # noqa: E501
        if not hasattr(self, "func"):
            running_loop = asyncio.get_event_loop()
            if running_loop.is_running():
                raise RuntimeError(
                    "Cannot call `evaluate_run` on an async run evaluator from"
                    " within a running event loop. Use `aevaluate_run` instead."
                )
            else:
                return running_loop.run_until_complete(
                    self.aevaluate_run(run, example)
                )
        if evaluator_run_id is None:
            evaluator_run_id = uuid.uuid4()
        metadata: dict[str, Any] = {"target_run_id": run.id}
        if getattr(run, "session_id", None):
            metadata["experiment"] = str(run.session_id)
        result = self.func(
            run,
            example,
            langsmith_extra={"run_id": evaluator_run_id, "metadata": metadata},
        )
        return self._format_result(result, evaluator_run_id)

    async def aevaluate_run(
        self,
        run: Run,
        example: Optional[Example] = None,
        evaluator_run_id: Optional[uuid.UUID] = None,
    ):
        """Evaluate a run asynchronously using the wrapped async function.

        This method directly invokes the wrapped async function with the
        provided arguments.

        Args:
            run (Run): The run to be evaluated.
            example (Optional[Example]): An optional example to be used
                in the evaluation.

        Returns:
            Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
        """
        if not hasattr(self, "afunc"):
            return await super().aevaluate_run(run, example)
        if evaluator_run_id is None:
            evaluator_run_id = uuid.uuid4()
        metadata: dict[str, Any] = {"target_run_id": run.id}
        if getattr(run, "session_id", None):
            metadata["experiment"] = str(run.session_id)
        result = await self.afunc(
            run,
            example,
            langsmith_extra={"run_id": evaluator_run_id, "metadata": metadata},
        )
        return self._format_result(result, evaluator_run_id)

    def __call__(
        self, run: Run, example: Optional[Example] = None
    ) -> Union[EvaluationResult, EvaluationResults]:
        """Make the evaluator callable, allowing it to be used like a function.

        This method enables the evaluator instance to be called directly, forwarding the
        call to `evaluate_run`.

        Args:
            run (Run): The run to be evaluated.
            example (Optional[Example]): An optional example to be used in the evaluation.

        Returns:
            Union[EvaluationResult, EvaluationResults]: The result of the evaluation.
        """  # noqa: E501
        return self.evaluate_run(run, example)

    def __repr__(self) -> str:
        """Represent the DynamicRunEvaluator object."""
        return f"<DynamicRunEvaluator {self._name}>"


def run_evaluator(
    func: Callable[
        [Run, Optional[Example]], Union[_RUNNABLE_OUTPUT, Awaitable[_RUNNABLE_OUTPUT]]
    ],
):
    """Create a run evaluator from a function.

    Decorator that transforms a function into a `RunEvaluator`.
    """
    return DynamicRunEvaluator(func)
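
# Illustrative sketch (not part of this module): wrapping a plain function with
# @run_evaluator. The "exact_match" metric and the "answer" output field are
# hypothetical; the decorated object is a DynamicRunEvaluator and can be used
# anywhere a RunEvaluator is expected.
#
#     @run_evaluator
#     def exact_match(run: Run, example: Optional[Example] = None) -> dict:
#         predicted = (run.outputs or {}).get("answer")
#         expected = (example.outputs or {}).get("answer") if example else None
#         return {"key": "exact_match", "score": float(predicted == expected)}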


_MAXSIZE = 10_000


def _maxsize_repr(obj: Any):
    s = repr(obj)
    if len(s) > _MAXSIZE:
        s = s[: _MAXSIZE - 4] + "...)"
    return s


class DynamicComparisonRunEvaluator:
    """Compare predictions (as traces) from 2 or more runs."""

    def __init__(
        self,
        func: Callable[
            [Sequence[Run], Optional[Example]],
            Union[_COMPARISON_OUTPUT, Awaitable[_COMPARISON_OUTPUT]],
        ],
        # Async function to be used for async evaluation. Optional.
        afunc: Optional[
            Callable[
                [Sequence[Run], Optional[Example]],
                Awaitable[_COMPARISON_OUTPUT],
            ]
        ] = None,
    ):
        """Initialize the `DynamicComparisonRunEvaluator` with a given function.

        Args:
            func (Callable): A function that takes a sequence of `Run` objects and
                an optional `Example` as arguments, and returns a dict or
                `ComparisonEvaluationResult`.
        """
        func, prepare_inputs = _normalize_comparison_evaluator_func(func)
        if afunc:
            afunc, prepare_inputs = _normalize_comparison_evaluator_func(afunc)  # type: ignore[assignment]

        def process_inputs(inputs: dict) -> dict:
            if prepare_inputs is None:
                return inputs
            _, _, traced_inputs = prepare_inputs(
                inputs.get("runs"), inputs.get("example")
            )
            return traced_inputs

        wraps(func)(self)
        from langsmith import run_helpers  # type: ignore

        if afunc is not None:
            self.afunc = run_helpers.ensure_traceable(
                afunc, process_inputs=process_inputs
            )
            self._name = getattr(afunc, "__name__", "DynamicRunEvaluator")
        if inspect.iscoroutinefunction(func):
            if afunc is not None:
                raise TypeError(
                    "Func was provided as a coroutine function, but afunc was "
                    "also provided. If providing both, func should be a regular "
                    "function to avoid ambiguity."
                )
            self.afunc = run_helpers.ensure_traceable(
                func, process_inputs=process_inputs
            )
            self._name = getattr(func, "__name__", "DynamicRunEvaluator")
        else:
            self.func = run_helpers.ensure_traceable(
                cast(
                    Callable[
                        [Sequence[Run], Optional[Example]],
                        _COMPARISON_OUTPUT,
                    ],
                    func,
                ),
                process_inputs=process_inputs,
            )
            self._name = getattr(func, "__name__", "DynamicRunEvaluator")

    @property
    def is_async(self) -> bool:
        """Check if the evaluator function is asynchronous.

        Returns:
            bool: `True` if the evaluator function is asynchronous, `False` otherwise.
        """
        return hasattr(self, "afunc")

    def compare_runs(
        self, runs: Sequence[Run], example: Optional[Example] = None
    ) -> ComparisonEvaluationResult:
        """Compare runs to score preferences.

        Args:
            runs: A list of runs to compare.
            example: An optional example to be used in the evaluation.
        """  # noqa: E501
        if not hasattr(self, "func"):
            running_loop = asyncio.get_event_loop()
            if running_loop.is_running():
                raise RuntimeError(
                    "Cannot call `compare_runs` on an async run evaluator from"
                    " within a running event loop. Use `acompare_runs` instead."
                )
            else:
                return running_loop.run_until_complete(
                    self.acompare_runs(runs, example)
                )
        source_run_id = uuid.uuid4()
        tags = self._get_tags(runs)
        # TODO: Add metadata for the "comparison experiment" here
        result = self.func(
            runs,
            example,
            langsmith_extra={"run_id": source_run_id, "tags": tags},
        )
        return self._format_results(result, source_run_id, runs)

    async def acompare_runs(
        self, runs: Sequence[Run], example: Optional[Example] = None
    ) -> ComparisonEvaluationResult:
        """Evaluate runs asynchronously using the wrapped async function.

        This method directly invokes the wrapped async function with the
        provided arguments.

        Args:
            runs (Sequence[Run]): The runs to be evaluated.
            example (Optional[Example]): An optional example to be used
                in the evaluation.

        Returns:
            ComparisonEvaluationResult: The result of the evaluation.
        """
        if not hasattr(self, "afunc"):
            return self.compare_runs(runs, example)
        source_run_id = uuid.uuid4()
        tags = self._get_tags(runs)
        # TODO: Add metadata for the "comparison experiment" here
        result = await self.afunc(
            runs,
            example,
            langsmith_extra={"run_id": source_run_id, "tags": tags},
        )
        return self._format_results(result, source_run_id, runs)

    def __call__(
        self, runs: Sequence[Run], example: Optional[Example] = None
    ) -> ComparisonEvaluationResult:
        """Make the evaluator callable, allowing it to be used like a function.

        This method enables the evaluator instance to be called directly, forwarding the
        call to `compare_runs`.

        Args:
            runs (Sequence[Run]): The runs to be evaluated.
            example (Optional[Example]): An optional example to be used in the evaluation.

        Returns:
            ComparisonEvaluationResult: The result of the evaluation.
        """  # noqa: E501
        return self.compare_runs(runs, example)

    def __repr__(self) -> str:
        """Represent the DynamicComparisonRunEvaluator object."""
        return f"<DynamicComparisonRunEvaluator {self._name}>"

    @staticmethod
    def _get_tags(runs: Sequence[Run]) -> list[str]:
        """Extract tags from runs."""
        # Add tags to support filtering
        tags = []
        for run in runs:
            tags.append("run:" + str(run.id))
            if getattr(run, "session_id", None):
                tags.append("experiment:" + str(run.session_id))
        return tags

    def _format_results(
        self,
        result: Union[dict, list, ComparisonEvaluationResult],
        source_run_id: uuid.UUID,
        runs: Sequence[Run],
    ) -> ComparisonEvaluationResult:
        if isinstance(result, ComparisonEvaluationResult):
            if not result.source_run_id:
                result.source_run_id = source_run_id
            return result
        elif isinstance(result, list):
            result = {
                "scores": {run.id: score for run, score in zip(runs, result)},
                "key": self._name,
                "source_run_id": source_run_id,
            }
        elif isinstance(result, dict):
            if "key" not in result:
                result["key"] = self._name
        else:
            msg = (
                "Expected 'dict', 'list' or 'ComparisonEvaluationResult' result "
                f"object. Received: {result=}"
            )
            raise ValueError(msg)
        try:
            return ComparisonEvaluationResult(
                **{"source_run_id": source_run_id, **result}
            )
        except ValidationError as e:
            raise ValueError(
                "Expected a dictionary with a 'key' and a dictionary of scores"
                " mapping run IDs to numeric scores, or a ComparisonEvaluationResult"
                f" object; got {result}"
            ) from e


def comparison_evaluator(
    func: Callable[
        [Sequence[Run], Optional[Example]],
        Union[_COMPARISON_OUTPUT, Awaitable[_COMPARISON_OUTPUT]],
    ],
) -> DynamicComparisonRunEvaluator:
    """Create a comparison evaluator from a function."""
    return DynamicComparisonRunEvaluator(func)
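
# Illustrative sketch (not part of this module): a pairwise evaluator built with
# comparison_evaluator. Returning a list of scores (one per run, in order) is
# coerced into a ComparisonEvaluationResult by _format_results; the
# shortest-answer heuristic is hypothetical.
#
#     @comparison_evaluator
#     def prefer_shorter(runs: Sequence[Run], example: Optional[Example] = None) -> list:
#         lengths = [len(str(run.outputs or {})) for run in runs]
#         shortest = min(lengths)
#         return [1.0 if length == shortest else 0.0 for length in lengths]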


def _normalize_evaluator_func(
    func: Callable,
) -> tuple[
    Union[
        Callable[[Run, Optional[Example]], _RUNNABLE_OUTPUT],
        Callable[[Run, Optional[Example]], Awaitable[_RUNNABLE_OUTPUT]],
    ],
    Optional[Callable[..., dict]],
]:
    supported_args = (
        "run",
        "example",
        "inputs",
        "outputs",
        "reference_outputs",
        "attachments",
    )
    sig = inspect.signature(func)
    all_args = [
        pname for pname, p in sig.parameters.items() if p.kind != p.VAR_KEYWORD
    ]
    args_with_defaults = [
        pname
        for pname, p in sig.parameters.items()
        if p.default is not inspect.Parameter.empty
    ]
    if not all_args or (
        not all(
            pname in supported_args or pname in args_with_defaults
            for pname in all_args
        )
        and len([a for a in all_args if a not in args_with_defaults]) != 2
    ):
        msg = (
            "Invalid evaluator function. Must have at least one "
            f"argument. Supported arguments are {supported_args}. Please "
            "see https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators"  # noqa: E501
        )
        raise ValueError(msg)
    # For backwards compatibility we assume custom arg names are Run and Example
    # types, respectively.
    elif not all(
        pname in supported_args or pname in args_with_defaults for pname in all_args
    ) or all_args == [
        "run",
        "example",
    ]:
        return func, None
    else:
        if inspect.iscoroutinefunction(func):

            def _prepare_inputs(
                run: Run, example: Optional[Example]
            ) -> tuple[list, dict, dict]:
                arg_map = {
                    "run": run,
                    "example": example,
                    "inputs": example.inputs if example else {},
                    "outputs": run.outputs or {},
                    "attachments": example.attachments or {} if example else {},
                    "reference_outputs": example.outputs or {} if example else {},
                }
                kwargs = {}
                args = []
                traced_inputs = {}
                for param_name, param in sig.parameters.items():
                    # Could have params with defaults that are not in the arg map
                    if param_name in arg_map:
                        if param.kind in (
                            param.POSITIONAL_OR_KEYWORD,
                            param.POSITIONAL_ONLY,
                        ):
                            args.append(arg_map[param_name])
                        else:
                            kwargs[param_name] = arg_map[param_name]
                        traced_inputs[param_name] = (
                            _maxsize_repr(arg_map[param_name])
                            if param_name in ("run", "example")
                            else arg_map[param_name]
                        )
                return args, kwargs, traced_inputs

            async def awrapper(
                run: Run, example: Optional[Example]
            ) -> _RUNNABLE_OUTPUT:
                args, kwargs, _ = _prepare_inputs(run, example)
                return await func(*args, **kwargs)

            awrapper.__name__ = (
                getattr(func, "__name__")
                if hasattr(func, "__name__")
                else awrapper.__name__
            )
            return awrapper, _prepare_inputs  # type: ignore[return-value]

        else:

            def _prepare_inputs(
                run: Run, example: Optional[Example]
            ) -> tuple[list, dict, dict]:
                arg_map = {
                    "run": run,
                    "example": example,
                    "inputs": example.inputs if example else {},
                    "outputs": run.outputs or {},
                    "attachments": example.attachments or {} if example else {},
                    "reference_outputs": example.outputs or {} if example else {},
                }
                kwargs = {}
                args = []
                traced_inputs = {}
                for param_name, param in sig.parameters.items():
                    # Could have params with defaults that are not in the arg map
                    if param_name in arg_map:
                        if param.kind in (
                            param.POSITIONAL_OR_KEYWORD,
                            param.POSITIONAL_ONLY,
                        ):
                            args.append(arg_map[param_name])
                        else:
                            kwargs[param_name] = arg_map[param_name]
                        traced_inputs[param_name] = (
                            _maxsize_repr(arg_map[param_name])
                            if param_name in ("run", "example")
                            else arg_map[param_name]
                        )
                return args, kwargs, traced_inputs

            def wrapper(run: Run, example: Optional[Example]) -> _RUNNABLE_OUTPUT:
                args, kwargs, _ = _prepare_inputs(run, example)
                return func(*args, **kwargs)

            wrapper.__name__ = (
                getattr(func, "__name__")
                if hasattr(func, "__name__")
                else wrapper.__name__
            )
            return wrapper, _prepare_inputs  # type: ignore[return-value]
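
# Illustrative sketch (not part of this module): _normalize_evaluator_func lets
# an evaluator declare only the named pieces it needs (e.g. "inputs", "outputs",
# "reference_outputs") instead of the legacy (run, example) pair; the wrappers
# above map the run and example onto those parameters. The "correct" metric is
# hypothetical.
#
#     def correct(outputs: dict, reference_outputs: dict) -> dict:
#         return {"key": "correct", "score": float(outputs == reference_outputs)}
#
#     wrapped, _ = _normalize_evaluator_func(correct)
#     # wrapped(run, example) now calls correct() with values drawn from the
#     # run's outputs and the example's outputs.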


def _normalize_comparison_evaluator_func(
    func: Callable,
) -> tuple[
    Union[
        Callable[[Sequence[Run], Optional[Example]], _COMPARISON_OUTPUT],
        Callable[[Sequence[Run], Optional[Example]], Awaitable[_COMPARISON_OUTPUT]],
    ],
    Optional[Callable[..., dict]],
]:
    supported_args = ("runs", "example", "inputs", "outputs", "reference_outputs")
    sig = inspect.signature(func)
    all_args = [
        pname for pname, p in sig.parameters.items() if p.kind != p.VAR_KEYWORD
    ]
    args_with_defaults = [
        pname
        for pname, p in sig.parameters.items()
        if p.default is not inspect.Parameter.empty
    ]
    if not all_args or (
        not all(
            pname in supported_args or pname in args_with_defaults
            for pname in all_args
        )
        and len([a for a in all_args if a not in args_with_defaults]) != 2
    ):
        msg = (
            "Invalid evaluator function. Must have at least one "
            f"argument. Supported arguments are {supported_args}. Please "
            "see https://docs.smith.langchain.com/evaluation/how_to_guides/evaluation/evaluate_llm_application#use-custom-evaluators"  # noqa: E501
        )
        raise ValueError(msg)
    # For backwards compatibility we assume custom arg names are Sequence[Run] and
    # Optional[Example] types, respectively.
    elif not all(
        pname in supported_args or pname in args_with_defaults for pname in all_args
    ) or all_args == [
        "runs",
        "example",
    ]:
        return func, None
    else:
        if inspect.iscoroutinefunction(func):

            def _prepare_inputs(
                runs: Sequence[Run], example: Optional[Example]
            ) -> tuple[list, dict, dict]:
                arg_map = {
                    "runs": runs,
                    "example": example,
                    "inputs": example.inputs if example else {},
                    "outputs": [run.outputs or {} for run in runs],
                    "reference_outputs": example.outputs or {} if example else {},
                }
                kwargs = {}
                args = []
                traced_inputs = {}
                for param_name, param in sig.parameters.items():
                    # Could have params with defaults that are not in the arg map
                    if param_name in arg_map:
                        if param.kind in (
                            param.POSITIONAL_OR_KEYWORD,
                            param.POSITIONAL_ONLY,
                        ):
                            args.append(arg_map[param_name])
                        else:
                            kwargs[param_name] = arg_map[param_name]
                        traced_inputs[param_name] = (
                            _maxsize_repr(arg_map[param_name])
                            if param_name in ("runs", "example")
                            else arg_map[param_name]
                        )
                return args, kwargs, traced_inputs

            async def awrapper(
                runs: Sequence[Run], example: Optional[Example]
            ) -> _COMPARISON_OUTPUT:
                args, kwargs, _ = _prepare_inputs(runs, example)
                return await func(*args, **kwargs)

            awrapper.__name__ = (
                getattr(func, "__name__")
                if hasattr(func, "__name__")
                else awrapper.__name__
            )
            return awrapper, _prepare_inputs  # type: ignore[return-value]

        else:

            def _prepare_inputs(
                runs: Sequence[Run], example: Optional[Example]
            ) -> tuple[list, dict, dict]:
                arg_map = {
                    "runs": runs,
                    "example": example,
                    "inputs": example.inputs if example else {},
                    "outputs": [run.outputs or {} for run in runs],
                    "reference_outputs": example.outputs or {} if example else {},
                }
                kwargs = {}
                args = []
                traced_inputs = {}
                for param_name, param in sig.parameters.items():
                    # Could have params with defaults that are not in the arg map
                    if param_name in arg_map:
                        if param.kind in (
                            param.POSITIONAL_OR_KEYWORD,
                            param.POSITIONAL_ONLY,
                        ):
                            args.append(arg_map[param_name])
                        else:
                            kwargs[param_name] = arg_map[param_name]
                        traced_inputs[param_name] = (
                            _maxsize_repr(arg_map[param_name])
                            if param_name in ("runs", "example")
                            else arg_map[param_name]
                        )
                return args, kwargs, traced_inputs

            def wrapper(
                runs: Sequence[Run], example: Optional[Example]
            ) -> _COMPARISON_OUTPUT:
                args, kwargs, _ = _prepare_inputs(runs, example)
                return func(*args, **kwargs)

            wrapper.__name__ = (
                getattr(func, "__name__")
                if hasattr(func, "__name__")
                else wrapper.__name__
            )
            return wrapper, _prepare_inputs  # type: ignore[return-value]


def _format_evaluator_result(
    result: Union[EvaluationResults, dict, str, int, bool, float, list],
) -> Union[EvaluationResults, dict]:
    if isinstance(result, (bool, float, int)):
        result = {"score": result}
    elif not result:
        raise ValueError(
            "Expected a non-empty dict, str, bool, int, float, list, "
            f"EvaluationResult, or EvaluationResults. Got {result}"
        )
    elif isinstance(result, list):
        if not all(isinstance(x, dict) for x in result):
            raise ValueError(
                f"Expected a list of dicts or EvaluationResults. Received {result}."
            )
        result = {"results": result}  # type: ignore[misc]
    elif isinstance(result, str):
        result = {"value": result}
    elif isinstance(result, dict):
        pass
    else:
        raise ValueError(
            "Expected a dict, str, bool, int, float, list, EvaluationResult, or "
            f"EvaluationResults. Got {result}"
        )
    return result
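
# Illustrative sketch (not part of this module): how primitive evaluator return
# values are coerced by _format_evaluator_result before being turned into
# feedback.
#
#     _format_evaluator_result(True)       # -> {"score": True}
#     _format_evaluator_result(0.5)        # -> {"score": 0.5}
#     _format_evaluator_result("concise")  # -> {"value": "concise"}
#     _format_evaluator_result([{"key": "a", "score": 1}])  # -> {"results": [...]}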


SUMMARY_EVALUATOR_T = Union[
    Callable[
        [Sequence[schemas.Run], Sequence[schemas.Example]],
        Union[EvaluationResult, EvaluationResults],
    ],
    Callable[
        [list[schemas.Run], list[schemas.Example]],
        Union[EvaluationResult, EvaluationResults],
    ],
]


def _normalize_summary_evaluator(func: Callable) -> SUMMARY_EVALUATOR_T:
    supported_args = ("runs", "examples", "inputs", "outputs", "reference_outputs")
    sig = inspect.signature(func)
    all_args = [pname for pname, p in sig.parameters.items()]
    args_with_defaults = [
        pname
        for pname, p in sig.parameters.items()
        if p.default is not inspect.Parameter.empty
    ]
    if not all_args or (
        not all(
            pname in supported_args or pname in args_with_defaults
            for pname in all_args
        )
        and len([a for a in all_args if a not in args_with_defaults]) != 2
    ):
        msg = (
            "Invalid evaluator function. Must have at least one "
            f"argument. Supported arguments are {supported_args}."
        )
        if all_args:
            msg += f" Received arguments {all_args}."
        raise ValueError(msg)
    # For backwards compatibility we assume custom arg names are Sequence[Run] and
    # Sequence[Example] types, respectively.
    elif not all(pname in supported_args for pname in all_args) or all_args == [
        "runs",
        "examples",
    ]:
        return func
    else:

        def wrapper(
            runs: Sequence[schemas.Run], examples: Sequence[schemas.Example]
        ) -> Union[EvaluationResult, EvaluationResults]:
            arg_map = {
                "runs": runs,
                "examples": examples,
                "inputs": [example.inputs for example in examples],
                "outputs": [run.outputs or {} for run in runs],
                "reference_outputs": [example.outputs or {} for example in examples],
            }
            kwargs = {}
            args = []
            for param_name, param in sig.parameters.items():
                # Could have params with defaults that are not in the arg map
                if param_name in arg_map:
                    if param.kind in (
                        param.POSITIONAL_OR_KEYWORD,
                        param.POSITIONAL_ONLY,
                    ):
                        args.append(arg_map[param_name])
                    else:
                        kwargs[param_name] = arg_map[param_name]
            result = func(*args, **kwargs)
            if isinstance(result, EvaluationResult):
                return result
            return _format_evaluator_result(result)  # type: ignore

        wrapper.__name__ = (
            getattr(func, "__name__") if hasattr(func, "__name__") else wrapper.__name__
        )
        return wrapper  # type: ignore[return-value]
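
# Illustrative sketch (not part of this module): a summary evaluator in the
# named-argument style normalized above. The "accuracy" metric is hypothetical;
# _normalize_summary_evaluator wraps it so it can be called as
# wrapper(runs, examples).
#
#     def accuracy(outputs: list, reference_outputs: list) -> dict:
#         matches = sum(o == r for o, r in zip(outputs, reference_outputs))
#         return {"key": "accuracy", "score": matches / max(len(outputs), 1)}
#
#     summary_eval = _normalize_summary_evaluator(accuracy)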