Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions packages/uipath/src/uipath/eval/runtime/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,6 +1022,9 @@ async def run_evaluator(
agent_output=output_data,
agent_trace=execution_output.spans,
expected_agent_behavior=eval_item.expected_agent_behavior,
simulation_instructions=eval_item.mocking_strategy.prompt
if isinstance(eval_item.mocking_strategy, LLMMockingStrategy)
else "",
)

result = await evaluator.validate_and_evaluate_criteria(
Expand Down
60 changes: 60 additions & 0 deletions packages/uipath/tests/cli/eval/test_eval_tracing_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@
import pytest

from uipath.eval.evaluators import BaseEvaluator
from uipath.eval.mocks._types import LLMMockingStrategy
from uipath.eval.models import NumericEvaluationResult
from uipath.eval.models.evaluation_set import EvaluationSet
from uipath.eval.models.models import AgentExecution
from uipath.eval.runtime import UiPathEvalContext, UiPathEvalRuntime
from uipath.runtime.schema import UiPathRuntimeSchema

Expand Down Expand Up @@ -402,6 +404,64 @@ async def test_run_evaluator_creates_evaluator_span(
assert span["attributes"]["evaluator_name"] == "AccuracyEvaluator"
assert span["attributes"]["eval_item_id"] == "eval-item-456"

@pytest.mark.asyncio
async def test_run_evaluator_passes_simulation_instructions(
    self,
    mock_trace_manager: MagicMock,
    mock_factory: MagicMock,
    mock_event_bus: MagicMock,
    mock_execution_output: MagicMock,
) -> None:
    """Check that run_evaluator forwards the LLM mocking prompt to the evaluator.

    The eval item carries an ``LLMMockingStrategy``; the stubbed evaluator
    asserts that the ``AgentExecution`` it is handed exposes that strategy's
    prompt as ``simulation_instructions``.
    """
    expected_instructions = "Return mocked API responses for the tool calls"

    # Stand-in eval item; only the mocking strategy is a real project object.
    # ``configure_mock`` is used so that ``name`` becomes a plain attribute.
    simulated_item = MagicMock()
    simulated_item.configure_mock(
        id="eval-item-with-simulation",
        name="Simulated item",
        inputs={"input": "test"},
        expected_agent_behavior="Agent should use the simulated tool",
        mocking_strategy=LLMMockingStrategy(
            prompt=expected_instructions,
            tools_to_simulate=[],
        ),
    )

    async def check_received_execution(
        agent_execution: AgentExecution,
        evaluation_criteria: object,
    ) -> NumericEvaluationResult:
        # Runs in place of the real evaluator call made by run_evaluator.
        assert agent_execution.simulation_instructions == expected_instructions
        return NumericEvaluationResult(score=1.0)

    criteria_call = AsyncMock(side_effect=check_received_execution)

    trajectory_evaluator = MagicMock(spec=BaseEvaluator)
    trajectory_evaluator.configure_mock(
        id="trajectory-evaluator",
        name="TrajectoryEvaluator",
    )
    trajectory_evaluator.validate_and_evaluate_criteria = criteria_call

    eval_runtime = UiPathEvalRuntime(
        context=create_eval_context(
            eval_set="test.json",
            entrypoint="main.py:main",
        ),
        factory=mock_factory,
        trace_manager=mock_trace_manager,
        event_bus=mock_event_bus,
    )

    await eval_runtime.run_evaluator(
        evaluator=trajectory_evaluator,
        execution_output=mock_execution_output,
        eval_item=simulated_item,
        evaluation_criteria=None,
    )

    # The in-callback assertion only counts if the evaluator was awaited.
    criteria_call.assert_awaited_once()

@pytest.mark.asyncio
async def test_multiple_evaluators_create_multiple_spans(
self,
Expand Down
64 changes: 64 additions & 0 deletions packages/uipath/tests/evaluators/test_evaluator_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -1728,6 +1728,70 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any:
assert isinstance(result, NumericEvaluationResult)
assert result.score == 0.9

@pytest.mark.asyncio
async def test_llm_trajectory_replaces_all_prompt_placeholders(
    self, sample_agent_execution: AgentExecution, mocker: MockerFixture
) -> None:
    """Verify that no ``{{...}}`` placeholder survives in the trajectory prompt.

    The LLM chat service is replaced by a stub that records the content of
    the last message it is sent; the assertions then inspect that text.
    """
    # Mutable holder the stubbed LLM call writes the final prompt into.
    recorded = {"prompt": ""}

    # Tool call the fake LLM answers with.
    submit_call = mocker.MagicMock()
    submit_call.id = "call_1"
    submit_call.name = "submit_evaluation"
    submit_call.arguments = {
        "score": 90,
        "justification": "The agent followed the expected behavior",
    }

    judge_message = mocker.MagicMock(content=None, tool_calls=[submit_call])
    llm_reply = mocker.MagicMock()
    llm_reply.choices = [mocker.MagicMock(message=judge_message)]

    async def fake_chat_completions(*args: Any, **kwargs: Any) -> Any:
        # Keep only the content of the last message of the request.
        recorded["prompt"] = kwargs["messages"][-1]["content"]
        return llm_reply

    llm_service = mocker.MagicMock()
    llm_service.chat_completions = fake_chat_completions

    mocker.patch("uipath.eval.evaluators.llm_as_judge_evaluator.UiPath")
    mocker.patch(
        "uipath.eval.evaluators.llm_as_judge_evaluator.UiPathLlmChatService",
        return_value=llm_service,
    )

    # One line per placeholder named in this template.
    prompt_template = (
        "input={{UserOrSyntheticInput}}\n"
        "instructions={{SimulationInstructions}}\n"
        "expected={{ExpectedAgentBehavior}}\n"
        "history={{AgentRunHistory}}"
    )
    judge = LLMJudgeTrajectoryEvaluator.model_validate(
        {
            "evaluatorConfig": {
                "name": "LlmTrajectoryTest",
                "prompt": prompt_template,
                "model": "gpt-4",
            },
            "id": str(uuid.uuid4()),
        }
    )

    execution_with_simulation = sample_agent_execution.model_copy(
        update={"simulation_instructions": "Mock the backend API response"}
    )
    expected_behavior = TrajectoryEvaluationCriteria(
        expected_agent_behavior="Agent should respond helpfully"
    )

    result = await judge.evaluate(execution_with_simulation, expected_behavior)

    rendered_prompt = recorded["prompt"]
    assert isinstance(result, NumericEvaluationResult)
    assert "{{" not in rendered_prompt
    assert "Agent should respond helpfully" in rendered_prompt
    assert "Mock the backend API response" in rendered_prompt
    # NOTE(review): presumably the repr of the fixture's agent input — confirm
    # against the ``sample_agent_execution`` fixture.
    assert "{'input': 'Test input'}" in rendered_prompt

@pytest.mark.asyncio
async def test_llm_trajectory_validate_and_evaluate_criteria(
self, sample_agent_execution: AgentExecution, mocker: MockerFixture
Expand Down
Loading