@pytest.mark.asyncio
async def test_run_evaluator_passes_simulation_instructions(
    self,
    mock_trace_manager: MagicMock,
    mock_factory: MagicMock,
    mock_event_bus: MagicMock,
    mock_execution_output: MagicMock,
) -> None:
    """Test that trajectory evaluators receive simulation instructions.

    The eval item carries an ``LLMMockingStrategy``; its ``prompt`` must
    show up as ``simulation_instructions`` on the ``AgentExecution`` that
    ``run_evaluator`` hands to the evaluator.

    The received ``AgentExecution`` is recorded by the fake evaluator and
    checked in the test body rather than inside the callback. ``AsyncMock``
    registers the await before it runs ``side_effect``, so an assertion
    raised inside the callback would leave ``assert_awaited_once()``
    passing if the error were ever caught on the way back to the test.
    """
    context = create_eval_context(
        eval_set="test.json",
        entrypoint="main.py:main",
    )

    runtime = UiPathEvalRuntime(
        context=context,
        factory=mock_factory,
        trace_manager=mock_trace_manager,
        event_bus=mock_event_bus,
    )

    eval_item = MagicMock()
    eval_item.id = "eval-item-with-simulation"
    eval_item.name = "Simulated item"
    eval_item.inputs = {"input": "test"}
    eval_item.expected_agent_behavior = "Agent should use the simulated tool"
    # A real strategy instance (not a MagicMock) so the isinstance check in
    # run_evaluator selects the prompt instead of the empty-string fallback.
    eval_item.mocking_strategy = LLMMockingStrategy(
        prompt="Return mocked API responses for the tool calls",
        tools_to_simulate=[],
    )

    evaluator = MagicMock(spec=BaseEvaluator)
    evaluator.id = "trajectory-evaluator"
    evaluator.name = "TrajectoryEvaluator"

    # Every AgentExecution the evaluator is invoked with, in call order.
    received_executions: list[AgentExecution] = []

    async def capture_agent_execution(
        agent_execution: AgentExecution,
        evaluation_criteria: object,
    ) -> NumericEvaluationResult:
        # Only record here; the checks run after run_evaluator returns.
        received_executions.append(agent_execution)
        return NumericEvaluationResult(score=1.0)

    evaluator.validate_and_evaluate_criteria = AsyncMock(
        side_effect=capture_agent_execution
    )

    await runtime.run_evaluator(
        evaluator=evaluator,
        execution_output=mock_execution_output,
        eval_item=eval_item,
        evaluation_criteria=None,
    )

    evaluator.validate_and_evaluate_criteria.assert_awaited_once()
    assert len(received_executions) == 1
    assert (
        received_executions[0].simulation_instructions
        == "Return mocked API responses for the tool calls"
    )
@pytest.mark.asyncio
async def test_llm_trajectory_replaces_all_prompt_placeholders(
    self, sample_agent_execution: AgentExecution, mocker: MockerFixture
) -> None:
    """Check that every built-in placeholder in a trajectory prompt is filled.

    The evaluator is configured with a prompt that uses all four
    placeholders. The LLM chat service is replaced by a fake that stores
    the content of the last message it is sent and answers with a
    ``submit_evaluation`` tool call. The stored prompt must contain no
    leftover ``{{`` and must include the expected behavior, the simulation
    instructions and the agent input.
    """
    # Holds the prompt text seen by the fake LLM; stays "" if never called.
    seen = {"prompt": ""}

    # configure_mock is used so that ``name`` becomes a plain attribute
    # rather than the mock's own display name.
    tool_call = mocker.MagicMock()
    tool_call.configure_mock(
        id="call_1",
        name="submit_evaluation",
        arguments={
            "score": 90,
            "justification": "The agent followed the expected behavior",
        },
    )

    judge_message = mocker.MagicMock(content=None, tool_calls=[tool_call])
    llm_response = mocker.MagicMock()
    llm_response.choices = [mocker.MagicMock(message=judge_message)]

    async def fake_chat_completions(*args: Any, **kwargs: Any) -> Any:
        # The rendered evaluation prompt is read from the final message.
        seen["prompt"] = kwargs["messages"][-1]["content"]
        return llm_response

    llm_service = mocker.MagicMock()
    llm_service.chat_completions = fake_chat_completions

    mocker.patch("uipath.eval.evaluators.llm_as_judge_evaluator.UiPath")
    mocker.patch(
        "uipath.eval.evaluators.llm_as_judge_evaluator.UiPathLlmChatService",
        return_value=llm_service,
    )

    template_lines = [
        "input={{UserOrSyntheticInput}}",
        "instructions={{SimulationInstructions}}",
        "expected={{ExpectedAgentBehavior}}",
        "history={{AgentRunHistory}}",
    ]
    evaluator = LLMJudgeTrajectoryEvaluator.model_validate(
        {
            "evaluatorConfig": {
                "name": "LlmTrajectoryTest",
                "prompt": "\n".join(template_lines),
                "model": "gpt-4",
            },
            "id": str(uuid.uuid4()),
        }
    )

    execution_with_instructions = sample_agent_execution.model_copy(
        update={"simulation_instructions": "Mock the backend API response"}
    )
    criteria = TrajectoryEvaluationCriteria(
        expected_agent_behavior="Agent should respond helpfully"
    )

    result = await evaluator.evaluate(execution_with_instructions, criteria)

    assert isinstance(result, NumericEvaluationResult)

    rendered_prompt = seen["prompt"]
    assert "{{" not in rendered_prompt
    assert "Agent should respond helpfully" in rendered_prompt
    assert "Mock the backend API response" in rendered_prompt
    # Presumably the repr of the sample_agent_execution fixture's input;
    # the fixture is defined outside this view.
    assert "{'input': 'Test input'}" in rendered_prompt
MockerFixture