diff --git a/packages/gooddata-eval/src/gooddata_eval/cli/main.py b/packages/gooddata-eval/src/gooddata_eval/cli/main.py index d243bcb2a..336ec298f 100644 --- a/packages/gooddata-eval/src/gooddata_eval/cli/main.py +++ b/packages/gooddata-eval/src/gooddata_eval/cli/main.py @@ -97,6 +97,12 @@ def _build_parser() -> argparse.ArgumentParser: ) run.add_argument("--json", dest="json_path", help="Write a JSON report to this path.") run.add_argument("--quiet", action="store_true", help="Suppress per-item progress output.") + run.add_argument( + "--preserve-failed", + action="store_true", + dest="preserve_failed", + help="Keep failed conversations on the server for post-mortem inspection.", + ) run.add_argument( "--langfuse", action="store_true", @@ -319,7 +325,12 @@ def on_langfuse_item_done( # --- non-agentic items (single-turn, use Evaluator) --- backend = _RoutingBackend( - ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id), + ChatClient( + host=config.host, + token=config.token, + workspace_id=config.workspace_id, + preserve_failed=config.preserve_failed, + ), SummaryClient(host=config.host, token=config.token, workspace_id=config.workspace_id), ) try: @@ -409,6 +420,7 @@ def main(argv: list[str] | None = None) -> int: log_to_langfuse=args.langfuse, quiet=args.quiet, kind=args.kind, + preserve_failed=args.preserve_failed, ) return _run(config) except ( diff --git a/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py b/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py index 9d0056c4c..4d3f93f9e 100644 --- a/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py +++ b/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py @@ -106,6 +106,7 @@ class _SseAccumulator: call_id_to_event_index: dict[str, int] = field(default_factory=dict) reasoning_steps: list[dict[str, Any]] = field(default_factory=list) adhoc_viz_args: list[dict[str, Any]] = field(default_factory=list) + response_id: str | None = None def _handle_text(content: dict[str, Any], acc: _SseAccumulator) -> None: @@ -176,7 +177,9 @@ def _build_chat_result(acc: _SseAccumulator) -> ChatResult: "objects": [acc.adhoc_viz_args[-1]], "reasoning": "\n".join(acc.viz_reasoning_parts), } - return ChatResult.model_validate(payload) + result = ChatResult.model_validate(payload) + result.response_id = acc.response_id + return result def parse_sse_lines(lines: Iterable[str]) -> ChatResult: @@ -204,9 +207,13 @@ def parse_sse_lines(lines: Iterable[str]) -> ChatResult: if code in _RETRYABLE_STATUS_CODES: raise TransientChatError(message, status_code=code, detail=detail) raise ChatError(message, status_code=code, detail=detail) + if event_data.get("responseId") and not acc.response_id: + acc.response_id = event_data["responseId"] item = event_data.get("item") if not item: continue + if item.get("responseId") and not acc.response_id: + acc.response_id = item["responseId"] role = item.get("role") content: dict[str, Any] = item.get("content") or {} ctype = content.get("type") @@ -227,10 +234,13 @@ def parse_sse_lines(lines: Iterable[str]) -> ChatResult: class ChatClient: """Single-turn AI chat client over the GoodData AI conversation endpoints.""" - def __init__(self, host: str, token: str, workspace_id: str, *, timeout: float = 300.0): + def __init__( + self, host: str, token: str, workspace_id: str, *, timeout: float = 300.0, preserve_failed: bool = False + ): self._base = f"{host.rstrip('/')}/api/v1/ai/workspaces/{workspace_id}/chat/conversations" self._auth = {"Authorization": f"Bearer {token}"} self._client = httpx.Client(timeout=timeout) + self._preserve_failed = preserve_failed def create_conversation(self) -> str: def _do() -> str: @@ -264,12 +274,26 @@ def _do() -> ChatResult: return _retry_transient(_do, is_retryable=_is_retryable_exc) def ask(self, item: DatasetItem) -> ChatResult: - """Run one single-turn conversation: create, send, parse, clean up.""" + """Run one conversation: create, send, parse, clean up. + + The conversation_id is attached to the returned ChatResult for tracing. + When ``preserve_failed`` is set, failed conversations are kept on the + server so they can be inspected after the run; the conversation_id is + attached to the raised exception as well. + """ conversation_id = self.create_conversation() + success = False try: - return self.send_message(conversation_id, item.question) + result = self.send_message(conversation_id, item.question) + result.conversation_id = conversation_id + success = True + return result + except Exception as exc: + exc.conversation_id = conversation_id # type: ignore[attr-defined] + raise finally: - self.delete_conversation(conversation_id) + if success or not self._preserve_failed: + self.delete_conversation(conversation_id) def close(self) -> None: self._client.close() diff --git a/packages/gooddata-eval/src/gooddata_eval/core/config.py b/packages/gooddata-eval/src/gooddata_eval/core/config.py index bb794fda5..277785176 100644 --- a/packages/gooddata-eval/src/gooddata_eval/core/config.py +++ b/packages/gooddata-eval/src/gooddata_eval/core/config.py @@ -19,3 +19,4 @@ class RunConfig: log_to_langfuse: bool = False quiet: bool = False kind: str = "visualization" + preserve_failed: bool = False diff --git a/packages/gooddata-eval/src/gooddata_eval/core/models.py b/packages/gooddata-eval/src/gooddata_eval/core/models.py index c831c3622..981000e1a 100644 --- a/packages/gooddata-eval/src/gooddata_eval/core/models.py +++ b/packages/gooddata-eval/src/gooddata_eval/core/models.py @@ -94,6 +94,8 @@ class ChatResult(BaseModel): created_visualizations: CreatedVisualizations | None = Field(default=None, alias="createdVisualizations") tool_call_events: list[ToolCallEvent] = Field(default_factory=list, alias="toolCallEvents") reasoning_step_count: int = Field(default=0, alias="reasoningStepCount") + conversation_id: str | None = Field(default=None, alias="conversationId") + response_id: str | None = Field(default=None, alias="responseId") class SummaryInput(BaseModel): diff --git a/packages/gooddata-eval/src/gooddata_eval/core/runner.py b/packages/gooddata-eval/src/gooddata_eval/core/runner.py index 314ce7cea..98e066418 100644 --- a/packages/gooddata-eval/src/gooddata_eval/core/runner.py +++ b/packages/gooddata-eval/src/gooddata_eval/core/runner.py @@ -123,7 +123,8 @@ def _run_one_item( if on_run_done is not None: on_run_done(run_index, runs, evaluation.passed, latency) except Exception as e: # agent/network/parse failure for this item - report.error = f"{type(e).__name__}: {e}" + conv_id = getattr(e, "conversation_id", None) + report.error = f"{type(e).__name__}: {e}" + (f" [conversation_id={conv_id}]" if conv_id else "") if best is not None: report.best_detail = best.detail return report diff --git a/packages/gooddata-eval/tests/test_cli.py b/packages/gooddata-eval/tests/test_cli.py index 04063d880..e505fd8f2 100644 --- a/packages/gooddata-eval/tests/test_cli.py +++ b/packages/gooddata-eval/tests/test_cli.py @@ -512,6 +512,62 @@ def test_cli_rejects_zero_concurrency(monkeypatch, fixtures_dir): assert exit_code == 2 +def test_cli_preserve_failed_flag_parsed(monkeypatch, fixtures_dir): + """--preserve-failed sets preserve_failed=True in RunConfig and is passed to ChatClient.""" + monkeypatch.setattr(cli_main, "resolve_connection", lambda host, token, profile: ("https://h", "tok")) + captured_kwargs: dict = {} + + class _FakeController: + def __init__(self, *a, **k): ... + def get_active(self): + return ActiveLlmProvider(provider_id="p", default_model_id="gpt-5.2") + + def resolve_and_activate(self, requested, provider=None): + return ResolvedModel(provider_id="p", model_id="gpt-5.2", switched=False, provider_name="P") + + def restore(self, original): ... + def close(self): ... + + monkeypatch.setattr(cli_main, "WorkspaceModelController", _FakeController) + + original_chat_client = cli_main.ChatClient + + def _capture_chat_client(**kwargs): + captured_kwargs.update(kwargs) + return object() + + monkeypatch.setattr(cli_main, "ChatClient", _capture_chat_client) + + def _fake_run(items, backend, *, runs, model, workspace_id, **kw): + return EvalReport( + model=model, + workspace_id=workspace_id, + items=[ + ItemReport(id="i1", dataset_name="d", test_kind="visualization", question="q", pass_at_k=True, runs=1) + ], + ) + + monkeypatch.setattr(cli_main, "run_items", _fake_run) + + exit_code = cli_main.main( + [ + "run", + "--host", + "https://h", + "--token", + "tok", + "--workspace", + "ws1", + "--dataset", + str(fixtures_dir / "sample_dataset"), + "--preserve-failed", + "--quiet", + ] + ) + assert exit_code == 0 + assert captured_kwargs.get("preserve_failed") is True + + def test_cli_rejects_negative_concurrency(monkeypatch, fixtures_dir): monkeypatch.setattr(cli_main, "resolve_connection", lambda host, token, profile: ("https://h", "tok")) exit_code = cli_main.main( diff --git a/packages/gooddata-eval/tests/test_runner.py b/packages/gooddata-eval/tests/test_runner.py index a1cb8acba..a3f1742b1 100644 --- a/packages/gooddata-eval/tests/test_runner.py +++ b/packages/gooddata-eval/tests/test_runner.py @@ -233,6 +233,31 @@ def on_done(index: int, total: int, report: ItemReport) -> None: assert sorted(done_ids) == [f"c{i}" for i in range(5)] +def test_run_items_error_report_includes_conversation_id(): + """When an exception has a conversation_id, it appears in the error report.""" + + class _ConvIdBackend: + def ask(self, item: DatasetItem) -> ChatResult: + exc = RuntimeError("agent error") + exc.conversation_id = "conv-xyz" # type: ignore[attr-defined] + raise exc + + report = run_items([_item()], _ConvIdBackend(), runs=1) + assert report.errored == 1 + assert "conversation_id=conv-xyz" in report.items[0].error + + +def test_run_items_error_report_omits_conversation_id_when_absent(): + """When no conversation_id is set, the error string has no bracket suffix.""" + + class _PlainBackend: + def ask(self, item: DatasetItem) -> ChatResult: + raise RuntimeError("plain error") + + report = run_items([_item()], _PlainBackend(), runs=1) + assert "conversation_id" not in report.items[0].error + + def test_run_items_callback_exception_is_logged_not_swallowed(capsys): """A raising callback prints a traceback to stderr but the run continues.""" backend = _FakeBackend([_chat_with(_viz_obj())] * 2) diff --git a/packages/gooddata-eval/tests/test_sse_client.py b/packages/gooddata-eval/tests/test_sse_client.py index 84f0bfa5b..06cc1c534 100644 --- a/packages/gooddata-eval/tests/test_sse_client.py +++ b/packages/gooddata-eval/tests/test_sse_client.py @@ -222,3 +222,128 @@ def test_float_env_uses_default_when_unset(monkeypatch): def test_float_env_reads_override(monkeypatch): monkeypatch.setenv("GD_TEST_FLOAT", "1.5") assert sse_mod._float_env("GD_TEST_FLOAT", 5.0) == 1.5 + + +def test_parse_sse_lines_captures_response_id_from_event_data(): + """response_id is captured from the top-level event_data.""" + lines = [ + 'data: {"responseId": "resp-123", "item": {"role": "assistant", "content": {"type": "text", "text": "hi"}}}', + ] + result = parse_sse_lines(lines) + assert result.response_id == "resp-123" + + +def test_parse_sse_lines_captures_response_id_from_item(): + """response_id is captured from item when not present at top level.""" + lines = [ + 'data: {"item": {"role": "assistant", "responseId": "resp-456", "content": {"type": "text", "text": "hi"}}}', + ] + result = parse_sse_lines(lines) + assert result.response_id == "resp-456" + + +def test_parse_sse_lines_first_response_id_wins(): + """Only the first responseId encountered is kept.""" + lines = [ + 'data: {"responseId": "first", "item": {"role": "assistant", "content": {"type": "text", "text": "a"}}}', + 'data: {"responseId": "second", "item": {"role": "assistant", "content": {"type": "text", "text": "b"}}}', + ] + result = parse_sse_lines(lines) + assert result.response_id == "first" + + +def test_ask_attaches_conversation_id_to_result(monkeypatch): + """ChatClient.ask() sets conversation_id on the returned ChatResult.""" + calls = [] + + def handler(request): + if request.method == "POST" and request.url.path.endswith("/conversations"): + return httpx.Response(200, json={"conversationId": "conv-abc"}) + if request.method == "POST" and "messages" in str(request.url): + return httpx.Response(200, content=_OK_SSE) + if request.method == "DELETE": + calls.append("delete") + return httpx.Response(204) + return httpx.Response(404) + + client = _client_with_handler(handler) + from gooddata_eval.core.models import DatasetItem + + item = DatasetItem(id="t1", dataset_name="d", test_kind="visualization", question="q", expected_output={}) + result = client.ask(item) + assert result.conversation_id == "conv-abc" + assert "delete" in calls # conversation cleaned up on success + + +def test_ask_preserve_failed_keeps_conversation_on_error(monkeypatch): + """With preserve_failed=True, failed conversations are not deleted.""" + calls = [] + + def handler(request): + if request.method == "POST" and request.url.path.endswith("/conversations"): + return httpx.Response(200, json={"conversationId": "conv-fail"}) + if request.method == "POST" and "messages" in str(request.url): + return httpx.Response(200, content=_NONRETRY_SSE) + if request.method == "DELETE": + calls.append("delete") + return httpx.Response(204) + return httpx.Response(404) + + monkeypatch.setattr(sse_mod.time, "sleep", lambda s: None) + client = ChatClient(host="https://example.invalid", token="t", workspace_id="w", preserve_failed=True) + client._client = httpx.Client(transport=httpx.MockTransport(handler)) + + from gooddata_eval.core.models import DatasetItem + + item = DatasetItem(id="t1", dataset_name="d", test_kind="visualization", question="q", expected_output={}) + with pytest.raises(ChatError): + client.ask(item) + assert "delete" not in calls # conversation preserved + + +def test_ask_without_preserve_failed_deletes_on_error(monkeypatch): + """Without preserve_failed, conversations are deleted even on error.""" + calls = [] + + def handler(request): + if request.method == "POST" and request.url.path.endswith("/conversations"): + return httpx.Response(200, json={"conversationId": "conv-del"}) + if request.method == "POST" and "messages" in str(request.url): + return httpx.Response(200, content=_NONRETRY_SSE) + if request.method == "DELETE": + calls.append("delete") + return httpx.Response(204) + return httpx.Response(404) + + monkeypatch.setattr(sse_mod.time, "sleep", lambda s: None) + client = _client_with_handler(handler) + + from gooddata_eval.core.models import DatasetItem + + item = DatasetItem(id="t1", dataset_name="d", test_kind="visualization", question="q", expected_output={}) + with pytest.raises(ChatError): + client.ask(item) + assert "delete" in calls # conversation deleted + + +def test_ask_attaches_conversation_id_to_exception(monkeypatch): + """On failure, conversation_id is attached to the raised exception.""" + + def handler(request): + if request.method == "POST" and request.url.path.endswith("/conversations"): + return httpx.Response(200, json={"conversationId": "conv-exc"}) + if request.method == "POST" and "messages" in str(request.url): + return httpx.Response(200, content=_NONRETRY_SSE) + if request.method == "DELETE": + return httpx.Response(204) + return httpx.Response(404) + + monkeypatch.setattr(sse_mod.time, "sleep", lambda s: None) + client = _client_with_handler(handler) + + from gooddata_eval.core.models import DatasetItem + + item = DatasetItem(id="t1", dataset_name="d", test_kind="visualization", question="q", expected_output={}) + with pytest.raises(ChatError) as ei: + client.ask(item) + assert ei.value.conversation_id == "conv-exc"