diff --git a/packages/gooddata-eval/src/gooddata_eval/cli/main.py b/packages/gooddata-eval/src/gooddata_eval/cli/main.py
index d243bcb2a..336ec298f 100644
--- a/packages/gooddata-eval/src/gooddata_eval/cli/main.py
+++ b/packages/gooddata-eval/src/gooddata_eval/cli/main.py
@@ -97,6 +97,12 @@ def _build_parser() -> argparse.ArgumentParser:
     )
     run.add_argument("--json", dest="json_path", help="Write a JSON report to this path.")
     run.add_argument("--quiet", action="store_true", help="Suppress per-item progress output.")
+    run.add_argument(
+        "--preserve-failed",
+        action="store_true",
+        dest="preserve_failed",
+        help="Keep failed conversations on the server for post-mortem inspection.",
+    )
     run.add_argument(
         "--langfuse",
         action="store_true",
@@ -319,7 +325,12 @@ def on_langfuse_item_done(
 
             # --- non-agentic items (single-turn, use Evaluator) ---
             backend = _RoutingBackend(
-                ChatClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
+                ChatClient(
+                    host=config.host,
+                    token=config.token,
+                    workspace_id=config.workspace_id,
+                    preserve_failed=config.preserve_failed,
+                ),
                 SummaryClient(host=config.host, token=config.token, workspace_id=config.workspace_id),
             )
             try:
@@ -409,6 +420,7 @@ def main(argv: list[str] | None = None) -> int:
             log_to_langfuse=args.langfuse,
             quiet=args.quiet,
             kind=args.kind,
+            preserve_failed=args.preserve_failed,
         )
         return _run(config)
     except (
diff --git a/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py b/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py
index 9d0056c4c..4d3f93f9e 100644
--- a/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py
+++ b/packages/gooddata-eval/src/gooddata_eval/core/chat/sse_client.py
@@ -106,6 +106,7 @@ class _SseAccumulator:
     call_id_to_event_index: dict[str, int] = field(default_factory=dict)
     reasoning_steps: list[dict[str, Any]] = field(default_factory=list)
     adhoc_viz_args: list[dict[str, Any]] = field(default_factory=list)
+    response_id: str | None = None
 
 
 def _handle_text(content: dict[str, Any], acc: _SseAccumulator) -> None:
@@ -176,7 +177,9 @@ def _build_chat_result(acc: _SseAccumulator) -> ChatResult:
             "objects": [acc.adhoc_viz_args[-1]],
             "reasoning": "\n".join(acc.viz_reasoning_parts),
         }
-    return ChatResult.model_validate(payload)
+    result = ChatResult.model_validate(payload)
+    result.response_id = acc.response_id
+    return result
 
 
 def parse_sse_lines(lines: Iterable[str]) -> ChatResult:
@@ -204,9 +207,13 @@ def parse_sse_lines(lines: Iterable[str]) -> ChatResult:
             if code in _RETRYABLE_STATUS_CODES:
                 raise TransientChatError(message, status_code=code, detail=detail)
             raise ChatError(message, status_code=code, detail=detail)
+        if event_data.get("responseId") and not acc.response_id:
+            acc.response_id = event_data["responseId"]
         item = event_data.get("item")
         if not item:
             continue
+        if item.get("responseId") and not acc.response_id:
+            acc.response_id = item["responseId"]
         role = item.get("role")
         content: dict[str, Any] = item.get("content") or {}
         ctype = content.get("type")
@@ -227,10 +234,13 @@ def parse_sse_lines(lines: Iterable[str]) -> ChatResult:
 class ChatClient:
     """Single-turn AI chat client over the GoodData AI conversation endpoints."""
 
-    def __init__(self, host: str, token: str, workspace_id: str, *, timeout: float = 300.0):
+    def __init__(
+        self, host: str, token: str, workspace_id: str, *, timeout: float = 300.0, preserve_failed: bool = False
+    ):
         self._base = f"{host.rstrip('/')}/api/v1/ai/workspaces/{workspace_id}/chat/conversations"
         self._auth = {"Authorization": f"Bearer {token}"}
         self._client = httpx.Client(timeout=timeout)
+        self._preserve_failed = preserve_failed
 
     def create_conversation(self) -> str:
         def _do() -> str:
@@ -264,12 +274,26 @@ def _do() -> ChatResult:
         return _retry_transient(_do, is_retryable=_is_retryable_exc)
 
     def ask(self, item: DatasetItem) -> ChatResult:
-        """Run one single-turn conversation: create, send, parse, clean up."""
+        """Run one conversation: create, send, parse, clean up.
+
+        The conversation_id is attached to the returned ChatResult for tracing.
+        When ``preserve_failed`` is set, failed conversations are kept on the
+        server so they can be inspected after the run; the conversation_id is
+        attached to the raised exception as well.
+        """
         conversation_id = self.create_conversation()
+        success = False
         try:
-            return self.send_message(conversation_id, item.question)
+            result = self.send_message(conversation_id, item.question)
+            result.conversation_id = conversation_id
+            success = True
+            return result
+        except Exception as exc:
+            exc.conversation_id = conversation_id  # type: ignore[attr-defined]
+            raise
         finally:
-            self.delete_conversation(conversation_id)
+            if success or not self._preserve_failed:
+                self.delete_conversation(conversation_id)
 
     def close(self) -> None:
         self._client.close()
diff --git a/packages/gooddata-eval/src/gooddata_eval/core/config.py b/packages/gooddata-eval/src/gooddata_eval/core/config.py
index bb794fda5..277785176 100644
--- a/packages/gooddata-eval/src/gooddata_eval/core/config.py
+++ b/packages/gooddata-eval/src/gooddata_eval/core/config.py
@@ -19,3 +19,4 @@ class RunConfig:
     log_to_langfuse: bool = False
     quiet: bool = False
     kind: str = "visualization"
+    preserve_failed: bool = False
diff --git a/packages/gooddata-eval/src/gooddata_eval/core/models.py b/packages/gooddata-eval/src/gooddata_eval/core/models.py
index c831c3622..981000e1a 100644
--- a/packages/gooddata-eval/src/gooddata_eval/core/models.py
+++ b/packages/gooddata-eval/src/gooddata_eval/core/models.py
@@ -94,6 +94,8 @@ class ChatResult(BaseModel):
     created_visualizations: CreatedVisualizations | None = Field(default=None, alias="createdVisualizations")
     tool_call_events: list[ToolCallEvent] = Field(default_factory=list, alias="toolCallEvents")
     reasoning_step_count: int = Field(default=0, alias="reasoningStepCount")
+    conversation_id: str | None = Field(default=None, alias="conversationId")
+    response_id: str | None = Field(default=None, alias="responseId")
 
 
 class SummaryInput(BaseModel):
diff --git a/packages/gooddata-eval/src/gooddata_eval/core/runner.py b/packages/gooddata-eval/src/gooddata_eval/core/runner.py
index 314ce7cea..98e066418 100644
--- a/packages/gooddata-eval/src/gooddata_eval/core/runner.py
+++ b/packages/gooddata-eval/src/gooddata_eval/core/runner.py
@@ -123,7 +123,8 @@ def _run_one_item(
             if on_run_done is not None:
                 on_run_done(run_index, runs, evaluation.passed, latency)
     except Exception as e:  # agent/network/parse failure for this item
-        report.error = f"{type(e).__name__}: {e}"
+        conv_id = getattr(e, "conversation_id", None)
+        report.error = f"{type(e).__name__}: {e}" + (f" [conversation_id={conv_id}]" if conv_id else "")
         if best is not None:
             report.best_detail = best.detail
         return report
diff --git a/packages/gooddata-eval/tests/test_cli.py b/packages/gooddata-eval/tests/test_cli.py
index 04063d880..e505fd8f2 100644
--- a/packages/gooddata-eval/tests/test_cli.py
+++ b/packages/gooddata-eval/tests/test_cli.py
@@ -512,6 +512,62 @@ def test_cli_rejects_zero_concurrency(monkeypatch, fixtures_dir):
     assert exit_code == 2
 
 
+def test_cli_preserve_failed_flag_parsed(monkeypatch, fixtures_dir):
+    """--preserve-failed sets preserve_failed=True in RunConfig and is passed to ChatClient."""
+    monkeypatch.setattr(cli_main, "resolve_connection", lambda host, token, profile: ("https://h", "tok"))
+    captured_kwargs: dict = {}
+
+    class _FakeController:
+        def __init__(self, *a, **k): ...
+        def get_active(self):
+            return ActiveLlmProvider(provider_id="p", default_model_id="gpt-5.2")
+
+        def resolve_and_activate(self, requested, provider=None):
+            return ResolvedModel(provider_id="p", model_id="gpt-5.2", switched=False, provider_name="P")
+
+        def restore(self, original): ...
+        def close(self): ...
+
+    monkeypatch.setattr(cli_main, "WorkspaceModelController", _FakeController)
+
+    original_chat_client = cli_main.ChatClient
+
+    def _capture_chat_client(**kwargs):
+        captured_kwargs.update(kwargs)
+        return object()
+
+    monkeypatch.setattr(cli_main, "ChatClient", _capture_chat_client)
+
+    def _fake_run(items, backend, *, runs, model, workspace_id, **kw):
+        return EvalReport(
+            model=model,
+            workspace_id=workspace_id,
+            items=[
+                ItemReport(id="i1", dataset_name="d", test_kind="visualization", question="q", pass_at_k=True, runs=1)
+            ],
+        )
+
+    monkeypatch.setattr(cli_main, "run_items", _fake_run)
+
+    exit_code = cli_main.main(
+        [
+            "run",
+            "--host",
+            "https://h",
+            "--token",
+            "tok",
+            "--workspace",
+            "ws1",
+            "--dataset",
+            str(fixtures_dir / "sample_dataset"),
+            "--preserve-failed",
+            "--quiet",
+        ]
+    )
+    assert exit_code == 0
+    assert captured_kwargs.get("preserve_failed") is True
+
+
 def test_cli_rejects_negative_concurrency(monkeypatch, fixtures_dir):
     monkeypatch.setattr(cli_main, "resolve_connection", lambda host, token, profile: ("https://h", "tok"))
     exit_code = cli_main.main(
diff --git a/packages/gooddata-eval/tests/test_runner.py b/packages/gooddata-eval/tests/test_runner.py
index a1cb8acba..a3f1742b1 100644
--- a/packages/gooddata-eval/tests/test_runner.py
+++ b/packages/gooddata-eval/tests/test_runner.py
@@ -233,6 +233,31 @@ def on_done(index: int, total: int, report: ItemReport) -> None:
     assert sorted(done_ids) == [f"c{i}" for i in range(5)]
 
 
+def test_run_items_error_report_includes_conversation_id():
+    """When an exception has a conversation_id, it appears in the error report."""
+
+    class _ConvIdBackend:
+        def ask(self, item: DatasetItem) -> ChatResult:
+            exc = RuntimeError("agent error")
+            exc.conversation_id = "conv-xyz"  # type: ignore[attr-defined]
+            raise exc
+
+    report = run_items([_item()], _ConvIdBackend(), runs=1)
+    assert report.errored == 1
+    assert "conversation_id=conv-xyz" in report.items[0].error
+
+
+def test_run_items_error_report_omits_conversation_id_when_absent():
+    """When no conversation_id is set, the error string has no bracket suffix."""
+
+    class _PlainBackend:
+        def ask(self, item: DatasetItem) -> ChatResult:
+            raise RuntimeError("plain error")
+
+    report = run_items([_item()], _PlainBackend(), runs=1)
+    assert "conversation_id" not in report.items[0].error
+
+
 def test_run_items_callback_exception_is_logged_not_swallowed(capsys):
     """A raising callback prints a traceback to stderr but the run continues."""
     backend = _FakeBackend([_chat_with(_viz_obj())] * 2)
diff --git a/packages/gooddata-eval/tests/test_sse_client.py b/packages/gooddata-eval/tests/test_sse_client.py
index 84f0bfa5b..06cc1c534 100644
--- a/packages/gooddata-eval/tests/test_sse_client.py
+++ b/packages/gooddata-eval/tests/test_sse_client.py
@@ -222,3 +222,128 @@ def test_float_env_uses_default_when_unset(monkeypatch):
 def test_float_env_reads_override(monkeypatch):
     monkeypatch.setenv("GD_TEST_FLOAT", "1.5")
     assert sse_mod._float_env("GD_TEST_FLOAT", 5.0) == 1.5
+
+
+def test_parse_sse_lines_captures_response_id_from_event_data():
+    """response_id is captured from the top-level event_data."""
+    lines = [
+        'data: {"responseId": "resp-123", "item": {"role": "assistant", "content": {"type": "text", "text": "hi"}}}',
+    ]
+    result = parse_sse_lines(lines)
+    assert result.response_id == "resp-123"
+
+
+def test_parse_sse_lines_captures_response_id_from_item():
+    """response_id is captured from item when not present at top level."""
+    lines = [
+        'data: {"item": {"role": "assistant", "responseId": "resp-456", "content": {"type": "text", "text": "hi"}}}',
+    ]
+    result = parse_sse_lines(lines)
+    assert result.response_id == "resp-456"
+
+
+def test_parse_sse_lines_first_response_id_wins():
+    """Only the first responseId encountered is kept."""
+    lines = [
+        'data: {"responseId": "first", "item": {"role": "assistant", "content": {"type": "text", "text": "a"}}}',
+        'data: {"responseId": "second", "item": {"role": "assistant", "content": {"type": "text", "text": "b"}}}',
+    ]
+    result = parse_sse_lines(lines)
+    assert result.response_id == "first"
+
+
+def test_ask_attaches_conversation_id_to_result(monkeypatch):
+    """ChatClient.ask() sets conversation_id on the returned ChatResult."""
+    calls = []
+
+    def handler(request):
+        if request.method == "POST" and request.url.path.endswith("/conversations"):
+            return httpx.Response(200, json={"conversationId": "conv-abc"})
+        if request.method == "POST" and "messages" in str(request.url):
+            return httpx.Response(200, content=_OK_SSE)
+        if request.method == "DELETE":
+            calls.append("delete")
+            return httpx.Response(204)
+        return httpx.Response(404)
+
+    client = _client_with_handler(handler)
+    from gooddata_eval.core.models import DatasetItem
+
+    item = DatasetItem(id="t1", dataset_name="d", test_kind="visualization", question="q", expected_output={})
+    result = client.ask(item)
+    assert result.conversation_id == "conv-abc"
+    assert "delete" in calls  # conversation cleaned up on success
+
+
+def test_ask_preserve_failed_keeps_conversation_on_error(monkeypatch):
+    """With preserve_failed=True, failed conversations are not deleted."""
+    calls = []
+
+    def handler(request):
+        if request.method == "POST" and request.url.path.endswith("/conversations"):
+            return httpx.Response(200, json={"conversationId": "conv-fail"})
+        if request.method == "POST" and "messages" in str(request.url):
+            return httpx.Response(200, content=_NONRETRY_SSE)
+        if request.method == "DELETE":
+            calls.append("delete")
+            return httpx.Response(204)
+        return httpx.Response(404)
+
+    monkeypatch.setattr(sse_mod.time, "sleep", lambda s: None)
+    client = ChatClient(host="https://example.invalid", token="t", workspace_id="w", preserve_failed=True)
+    client._client = httpx.Client(transport=httpx.MockTransport(handler))
+
+    from gooddata_eval.core.models import DatasetItem
+
+    item = DatasetItem(id="t1", dataset_name="d", test_kind="visualization", question="q", expected_output={})
+    with pytest.raises(ChatError):
+        client.ask(item)
+    assert "delete" not in calls  # conversation preserved
+
+
+def test_ask_without_preserve_failed_deletes_on_error(monkeypatch):
+    """Without preserve_failed, conversations are deleted even on error."""
+    calls = []
+
+    def handler(request):
+        if request.method == "POST" and request.url.path.endswith("/conversations"):
+            return httpx.Response(200, json={"conversationId": "conv-del"})
+        if request.method == "POST" and "messages" in str(request.url):
+            return httpx.Response(200, content=_NONRETRY_SSE)
+        if request.method == "DELETE":
+            calls.append("delete")
+            return httpx.Response(204)
+        return httpx.Response(404)
+
+    monkeypatch.setattr(sse_mod.time, "sleep", lambda s: None)
+    client = _client_with_handler(handler)
+
+    from gooddata_eval.core.models import DatasetItem
+
+    item = DatasetItem(id="t1", dataset_name="d", test_kind="visualization", question="q", expected_output={})
+    with pytest.raises(ChatError):
+        client.ask(item)
+    assert "delete" in calls  # conversation deleted
+
+
+def test_ask_attaches_conversation_id_to_exception(monkeypatch):
+    """On failure, conversation_id is attached to the raised exception."""
+
+    def handler(request):
+        if request.method == "POST" and request.url.path.endswith("/conversations"):
+            return httpx.Response(200, json={"conversationId": "conv-exc"})
+        if request.method == "POST" and "messages" in str(request.url):
+            return httpx.Response(200, content=_NONRETRY_SSE)
+        if request.method == "DELETE":
+            return httpx.Response(204)
+        return httpx.Response(404)
+
+    monkeypatch.setattr(sse_mod.time, "sleep", lambda s: None)
+    client = _client_with_handler(handler)
+
+    from gooddata_eval.core.models import DatasetItem
+
+    item = DatasetItem(id="t1", dataset_name="d", test_kind="visualization", question="q", expected_output={})
+    with pytest.raises(ChatError) as ei:
+        client.ask(item)
+    assert ei.value.conversation_id == "conv-exc"