Add LLM judge evaluator factory #39
Changes from all commits: 827fb3f, 483c530, 8ae7889, f5820da, 52b488d, 0791d2c, 5102e4c
@@ -0,0 +1,17 @@
"""Graders for agent evaluations.

This subpackage contains evaluator factories that can be shared across
agent domains. The factories return Langfuse-compatible evaluator callables
that can be passed directly to ``dataset.run_experiment`` or the wrappers in the
evaluation harness.
"""

from .llm_judge import DEFAULT_LLM_JUDGE_RUBRIC, LLMJudgeMetric, LLMJudgeResponse, create_llm_as_judge_evaluator


__all__ = [
    "DEFAULT_LLM_JUDGE_RUBRIC",
    "LLMJudgeMetric",
    "LLMJudgeResponse",
    "create_llm_as_judge_evaluator",
]
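For context, a rough usage sketch of the new factory follows. The keyword arguments (`name`, `rubric`) and the `run_experiment` wiring are illustrative assumptions only; the factory's real signature lives in `llm_judge.py`, which is not reproduced in this diff excerpt.

```python
# Hypothetical usage sketch -- the factory's actual parameters are defined in
# llm_judge.py (not shown here), so the keyword arguments below are guesses.
from aieng.agent_evals.evaluation.graders import (
    DEFAULT_LLM_JUDGE_RUBRIC,
    create_llm_as_judge_evaluator,
)

correctness_judge = create_llm_as_judge_evaluator(
    name="correctness",               # assumed parameter
    rubric=DEFAULT_LLM_JUDGE_RUBRIC,  # assumed parameter
)

# The resulting callable is intended to be passed straight to Langfuse, e.g.
# dataset.run_experiment(..., evaluators=[correctness_judge]),
# or to the wrappers in the evaluation harness.
```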
@@ -0,0 +1,210 @@
"""Shared helpers for OpenAI-compatible LLM-based graders."""

import json
from pathlib import Path
from typing import Any, TypeVar, cast

from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig
from aieng.agent_evals.evaluation.types import Evaluation
from langfuse.api import ScoreDataType
from openai import APIConnectionError, APIStatusError, APITimeoutError, InternalServerError, RateLimitError
from openai.types.chat.parsed_chat_completion import ParsedChatCompletion
from pydantic import BaseModel
from tenacity import AsyncRetrying, retry_if_exception, stop_after_attempt, wait_exponential


T = TypeVar("T", bound=BaseModel)


async def run_structured_parse_call(
    *,
    openai_client: Any,
    default_model: str,
    model_config: LLMRequestConfig,
    system_prompt: str,
    user_prompt: str,
    response_format: type[T],
) -> ParsedChatCompletion[T]:
    """Run ``chat.completions.parse`` with retry for transient API failures.

    Parameters
    ----------
    openai_client : Any
        OpenAI-compatible async client instance.
    default_model : str
        Fallback model name when ``model_config.model`` is not provided.
    model_config : LLMRequestConfig
        Request and retry configuration.
    system_prompt : str
        System prompt content.
    user_prompt : str
        User prompt content.
    response_format : type[T]
        Pydantic model used by ``parse`` for structured output.

    Returns
    -------
    ParsedChatCompletion[T]
        Completion object returned by ``chat.completions.parse``.
    """
    model_name = model_config.model or default_model
    request_kwargs: dict[str, Any] = dict(model_config.extra_request_kwargs)
    request_kwargs.update(
        {
            "model": model_name,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            "response_format": response_format,
            "temperature": model_config.temperature,
        }
    )
    if model_config.max_completion_tokens is not None:
        request_kwargs["max_completion_tokens"] = model_config.max_completion_tokens
    if model_config.timeout_sec is not None:
        request_kwargs["timeout"] = model_config.timeout_sec

    retrying = AsyncRetrying(
        stop=stop_after_attempt(model_config.retry_max_attempts),
        wait=wait_exponential(
            multiplier=model_config.retry_backoff_multiplier,
            min=model_config.retry_initial_wait_sec,
            max=model_config.retry_max_wait_sec,
        ),
        retry=retry_if_exception(is_retryable_api_exception),
        reraise=True,
    )

    async for attempt in retrying:
        with attempt:
            response = await openai_client.chat.completions.parse(**request_kwargs)
            return cast(ParsedChatCompletion[T], response)

    # Defensive fallback: tenacity should either return above or raise.
    raise RuntimeError("Structured parse call failed unexpectedly without a result.")


def is_retryable_api_exception(exc: BaseException) -> bool:
    """Return True when exception is likely transient and should be retried."""
    if isinstance(exc, (APIConnectionError, APITimeoutError, RateLimitError, InternalServerError)):
        return True

    if isinstance(exc, APIStatusError):
        status = getattr(exc, "status_code", None)
        return status in (408, 429) or (status is not None and status >= 500)

    return False


def build_error_evaluation(*, name: str, error: Exception, prefix: str) -> Evaluation:
    """Build a deterministic error metric.

    Parameters
    ----------
    name : str
        Metric name.
    error : Exception
        Error that triggered the fallback metric.
    prefix : str
        Prefix used in the metric comment for context.

    Returns
    -------
    Evaluation
        Boolean error evaluation containing structured error metadata.
    """
    message = str(error) or error.__class__.__name__
    return Evaluation(
        name=name,
        value=True,
        comment=f"{prefix}: {message}",
        data_type=ScoreDataType.BOOLEAN,
        metadata={"error_type": error.__class__.__name__, "error": message},
    )


def render_system_prompt_with_optional_rubric(*, system_prompt_template: str, rubric_text: str | None) -> str:
    """Render system prompt and inject rubric text when available.

    Parameters
    ----------
    system_prompt_template : str
        Base system prompt template.
    rubric_text : str | None
        Rubric content in markdown format.

    Returns
    -------
    str
        Rendered system prompt with rubric inserted or appended.
    """
    rubric_section = ""
    if rubric_text:
        rubric_section = f"# Rubric\n{rubric_text.strip()}"

    if "{rubric_section}" in system_prompt_template:
        return system_prompt_template.format(rubric_section=rubric_section)

    if rubric_section:
        # Appending the rubric keeps custom system templates simple when users omit
        # the placeholder in quick evaluator setups.
        return f"{system_prompt_template.rstrip()}\n\n{rubric_section}\n"

    return system_prompt_template


def load_markdown(markdown: str | Path | None) -> str | None:
    """Load markdown from a raw string or file path.

    Parameters
    ----------
    markdown : str | Path | None
        Markdown text or file path.

    Returns
    -------
    str | None
        Loaded markdown text, or ``None`` when not provided.
    """
    if markdown is None:
        return None
    if isinstance(markdown, Path):
        return markdown.read_text(encoding="utf-8")

    path_candidate = Path(markdown)
    if path_candidate.suffix.lower() == ".md" and path_candidate.exists():
        return path_candidate.read_text(encoding="utf-8")
    return markdown


def serialize_for_prompt(value: Any) -> str:
    """Serialize values to readable JSON-like prompt text.

    Parameters
    ----------
    value : Any
        Value to serialize.

    Returns
    -------
    str
        JSON-like string representation suitable for prompts.
    """
    try:
        # Keep unicode characters readable and stabilize formatting for
        # deterministic prompt snapshots during tests.
        return json.dumps(value, ensure_ascii=False, indent=2, default=str)
    except (TypeError, ValueError):
        return str(value)


__all__ = [
    "LLMRequestConfig",
    "build_error_evaluation",
    "is_retryable_api_exception",
    "load_markdown",
    "render_system_prompt_with_optional_rubric",
    "run_structured_parse_call",
    "serialize_for_prompt",
]
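To show how these helpers are meant to compose, here is a minimal sketch of a single judge call. It assumes the helpers above are in scope (their module path is not visible in this excerpt); the `JudgeVerdict` model, the prompts, the `rubric.md` path, and the `gpt-4o-mini` default are illustrative, not project defaults.

```python
# Sketch under the assumptions stated above; not the actual llm_judge implementation.
# Assumes this code sits next to the helpers above, so they are in scope.
from openai import AsyncOpenAI
from pydantic import BaseModel

from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig


class JudgeVerdict(BaseModel):
    """Illustrative structured output for the judge."""

    score: float
    reasoning: str


async def judge_once(client: AsyncOpenAI, question: str, answer: str) -> JudgeVerdict | None:
    config = LLMRequestConfig(retry_max_attempts=3, timeout_sec=30.0)
    system_prompt = render_system_prompt_with_optional_rubric(
        system_prompt_template="You are a strict grader.\n\n{rubric_section}",
        # load_markdown reads rubric.md if it exists; otherwise it returns the string itself.
        rubric_text=load_markdown("rubric.md"),
    )
    try:
        completion = await run_structured_parse_call(
            openai_client=client,
            default_model="gpt-4o-mini",  # placeholder; the harness default is not shown here
            model_config=config,
            system_prompt=system_prompt,
            user_prompt=serialize_for_prompt({"question": question, "answer": answer}),
            response_format=JudgeVerdict,
        )
        return completion.choices[0].message.parsed
    except Exception as error:
        # A real grader would return a boolean error metric here instead of None, e.g.
        # build_error_evaluation(name="llm_judge_error", error=error, prefix="Judge call failed").
        return None
```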
@@ -0,0 +1,44 @@
"""Configuration classes for LLM-based graders."""

from dataclasses import dataclass, field
from typing import Any


@dataclass(frozen=True)
class LLMRequestConfig:
    """Configuration for the underlying judge model call.

    Parameters
    ----------
    model : str | None, optional, default=None
        Explicit model name for the judge. If omitted, the harness default
        evaluator model is used.
    temperature : float, optional, default=0.0
        Sampling temperature for the judge call.
    max_completion_tokens : int | None, optional, default=None
        Optional token cap for the judge completion.
    timeout_sec : float | None, optional, default=None
        Optional request timeout in seconds.
    extra_request_kwargs : dict[str, Any], optional, default_factory=dict
        Additional OpenAI-compatible request arguments forwarded to
        ``chat.completions.parse``.
    retry_max_attempts : int, optional, default=5
        Maximum number of attempts for transient judge API failures. Set to
        ``1`` to disable retries.
    retry_initial_wait_sec : float, optional, default=1.0
        Initial backoff delay in seconds.
    retry_max_wait_sec : float, optional, default=10.0
        Maximum backoff delay in seconds.
    retry_backoff_multiplier : float, optional, default=2.0
        Exponential backoff multiplier.
    """

    model: str | None = None
    temperature: float = 0.0
Member: Just FYI, I thought it made sense to also have this as a config variable, so I introduced it.

Member: Oops, clicked "request changes" by mistake.
    max_completion_tokens: int | None = None
    timeout_sec: float | None = None
    extra_request_kwargs: dict[str, Any] = field(default_factory=dict)
    retry_max_attempts: int = 5
    retry_initial_wait_sec: float = 1.0
    retry_max_wait_sec: float = 10.0
    retry_backoff_multiplier: float = 2.0
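As a quick illustration of the knobs this dataclass exposes, the snippet below builds a one-shot judge config; the model name and the `top_p` pass-through are examples, not project defaults.

```python
from aieng.agent_evals.evaluation.graders.config import LLMRequestConfig

# One attempt (retries disabled), a short timeout, and an extra
# OpenAI-compatible request argument forwarded to chat.completions.parse.
strict_config = LLMRequestConfig(
    model="gpt-4o-mini",          # example model name, not the harness default
    retry_max_attempts=1,         # 1 attempt == retries disabled
    timeout_sec=15.0,
    extra_request_kwargs={"top_p": 1.0},
)
```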
Comment: Why do you need the `*` as the first parameter?

Reply: It's for enforcing keyword-only arguments, so the caller has to provide the names of the arguments. It's not strictly necessary since it's inside a "private" module, but I'd still keep it as is since it doesn't hurt.
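To make the point about the bare `*` concrete, here is a tiny standalone example (not project code) showing that it forces callers to spell out argument names:

```python
def build_metric(*, name: str, prefix: str) -> str:
    # Everything after the bare * is keyword-only.
    return f"{prefix}: {name}"


build_metric(name="llm_judge_error", prefix="Judge call failed")   # OK
# build_metric("llm_judge_error", "Judge call failed")             # TypeError: takes 0 positional arguments
```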