fix(review): resolve 11 P1 blockers from ce-code-review

P1#1 config_driven: propagate trace_outcome into output_data so lifecycle._is_failure_path() detects non-success outcomes
P1#2 portal: route through ConfigDrivenAgent.execute_stream (not react_engine.execute_stream directly) so evolution hooks fire and trace_outcome propagates; add pre-built messages support in _build_llm_messages
P1#3 sandbox: make network_block reentrant via module-level reference counter + threading.Lock - concurrent VERIFICATION phases no longer permanently block all new connections
P1#4 chat: replace dead isinstance(_PlanExecEngine) check with hasattr(_spec_review_handler) to wire the spec review gate
P1#5 plan_exec_engine: complete max_reflections threading chain (PlanExecEngine + ReActStepExecutor constructors)
P1#6 plan_exec_engine: enforce phase budgets (max_steps from phase_budgets, not hardcoded 5)
P1#7 plan_exec_engine: use current plan (not stale plan var) in aggregation after replan
P1#8 plan_exec_engine: map failure to failed status (not success)
P1#9 app: add drain timeout for pending evolution tasks on shutdown
P1#10 portal: handle spec_review_reply in WS handler
P1#11 chat: persist spec_review_request/reply/timeout to conversation store so reload can reconstruct gate state

Tests: 116 related tests pass; 26 pre-existing failures unchanged (stash-verified). ruff lint clean.
2026-07-04 01:10:01 +08:00 · 2026-07-04 01:10:01 +08:00 · e5e76697a9
parent 7c900ce280
commit e5e76697a9
9 changed files with 355 additions and 163 deletions
--- a/src/agentkit/core/config_driven.py
+++ b/src/agentkit/core/config_driven.py
@ -72,6 +72,11 @@ async def drain_pending_evolution_tasks() -> None:
    await asyncio.gather(*_pending_evolution_tasks, return_exceptions=True)
 def get_evolution_dropped_count() -> int:
    """Return the number of evolution tasks dropped due to backpressure.

    Reads the module-level ``_evolution_dropped_count`` counter and returns
    its current value; this function does not modify the counter.
    """
    return _evolution_dropped_count
 class AgentConfig:
    """Agent 配置模型，从 YAML 或 Dict 构建"""
@ -739,7 +744,20 @@ class ConfigDrivenAgent(BaseAgent, EvolutionMixin):
        Shared by all _handle_*_stream methods to avoid duplicating the
        message-rendering logic that mirrors the sync _handle_* methods.
        Portal path: if ``task.input_data["messages"]`` is present (a list of
        ``{role, content}`` dicts), use those pre-built messages directly
        instead of rendering the prompt template. This lets the portal route
        through ``execute_stream`` (inheriting evolution hooks + trace_outcome
        propagation) while keeping its external message-building logic.
        """
        prebuilt = task.input_data.get("messages")
        if prebuilt is not None:
            system_prompt = task.input_data.get("system_prompt")
            user_messages = [m for m in prebuilt if m.get("role") != "system"]
            if not user_messages:
                user_messages = [{"role": "user", "content": str(task.input_data)}]
            return system_prompt, user_messages
        variables = task.input_data.copy()
        variables["task_type"] = task.task_type
        if self._prompt_template:
@ -774,22 +792,35 @@ class ConfigDrivenAgent(BaseAgent, EvolutionMixin):
        token = CancellationToken()
        self._active_tokens[task.task_id] = token
        _stream_output: dict = {}
        _stream_trace_outcome: str = "success"
        _stream_error: BaseException | None = None
        _stream_completed = False
        _stream_started_at = datetime.now(timezone.utc)
        try:
            await self._register_mcp_tools()
            async for event in self.handle_task_stream(task):
                if event.event_type == "final_answer":
                    _raw = event.data.get("output", "")
                    _stream_output = {"content": _raw} if isinstance(_raw, str) else _raw
                    # PLAN_EXEC path may embed trace_outcome in final_answer.
                    _to = event.data.get("trace_outcome")
                    if _to:
                        _stream_trace_outcome = _to
                elif event.event_type == "final_result":
                    # REACT path: final_result carries ReActResult.status.
                    _result = event.data.get("result")
                    if _result is not None:
                        _stream_trace_outcome = getattr(_result, "status", "success")
                yield event
            _stream_completed = True
        except asyncio.CancelledError as ce:
            # Cancellation must propagate, but hooks still fire (U2 edge case).
            _stream_error = ce
            _stream_trace_outcome = "cancelled"
            raise
        except Exception as e:
            _stream_error = e
            _stream_trace_outcome = "error"
            raise
        finally:
            # async generator 的 finally 在 generator 关闭时执行（GC/aclose/正常结束）
@ -797,6 +828,12 @@ class ConfigDrivenAgent(BaseAgent, EvolutionMixin):
            # KTD-4: lifecycle parity — fire evolution hooks fire-and-forget.
            try:
                now = datetime.now(timezone.utc)
                # KTD-8: propagate trace_outcome into output_data so
                # lifecycle._is_failure_path() can detect non-success outcomes.
                if _stream_output:
                    _stream_output["trace_outcome"] = _stream_trace_outcome
                else:
                    _stream_output = {"trace_outcome": _stream_trace_outcome}
                if _stream_error is not None:
                    if isinstance(_stream_error, (asyncio.CancelledError, TaskCancelledError)):
                        status = TaskStatus.CANCELLED
@ -810,17 +847,29 @@ class ConfigDrivenAgent(BaseAgent, EvolutionMixin):
                        status=status,
                        output_data=None,
                        error_message=err_msg,
-                        started_at=now,
+                        started_at=_stream_started_at,
                        completed_at=now,
                    )
                elif _stream_completed:
                    # KTD-8: map non-success trace_outcomes to FAILED.
                    if _stream_trace_outcome in (
                        "gave_up_after_reflections",
                        "verify_failed",
                        "verify_quota_exhausted",
                        "failed",
                    ):
                        status = TaskStatus.FAILED
                        err_msg = _stream_trace_outcome
                    else:
                        status = TaskStatus.COMPLETED
                        err_msg = None
                    result = TaskResult(
                        task_id=task.task_id,
                        agent_name=self.name,
-                        status=TaskStatus.COMPLETED,
+                        status=status,
                        output_data=_stream_output,
-                        error_message=None,
+                        error_message=err_msg,
-                        started_at=now,
+                        started_at=_stream_started_at,
                        completed_at=now,
                    )
                else:
@ -831,7 +880,7 @@ class ConfigDrivenAgent(BaseAgent, EvolutionMixin):
                        status=TaskStatus.CANCELLED,
                        output_data=None,
                        error_message="stream closed before completion",
-                        started_at=now,
+                        started_at=_stream_started_at,
                        completed_at=now,
                    )
                self._trigger_evolution_hooks(task, result)
--- a/src/agentkit/core/plan_exec_engine.py
+++ b/src/agentkit/core/plan_exec_engine.py
@ -121,6 +121,10 @@ class PlanExecEngine:
        # user's decision. None = skip the gate (backward compat — the engine
        # proceeds directly to execution after Spec persistence).
        spec_review_handler: SpecReviewHandler | None = None,
        # KTD-2/R4: max reflections for ReActEngine reinjection→reflection
        # escalation. Threaded through to each step's ReActEngine so the
        # verify-failed path can escalate from reinjection to full reflection.
        max_reflections: int = 2,
    ):
        """
        Args:
@ -159,6 +163,8 @@ class PlanExecEngine:
        self._pitfall_detector = pitfall_detector
        # U8/R8: spec review gate handler. None = skip gate (backward compat).
        self._spec_review_handler = spec_review_handler
        # KTD-2/R4: max reflections threaded to each step's ReActEngine.
        self._max_reflections = max_reflections
        # U4/R11: copy the default to avoid mutating the module-level dict.
        self._phase_budgets = (
            dict(phase_budgets) if phase_budgets is not None else dict(_DEFAULT_PHASE_BUDGETS)
@ -605,9 +611,10 @@ class PlanExecEngine:
                    "output": output,
                    "total_steps": len(state.trajectory),
                    "total_tokens": state.total_tokens,
-                    "plan_id": plan.plan_id,
+                    "plan_id": current_plan.plan_id,
                    "plan_status": plan_result.status.value,
                    "replanned": state.replanned,
                    "trace_outcome": trace_outcome,
                },
            )
@ -637,7 +644,7 @@ class PlanExecEngine:
    async def _inject_pitfall_warnings(
        self,
        goal: str,
-        plan_steps: list[Any],
+        plan_steps: list[PlanStep],
        task_type: str,
        actor: str,
        system_prompt: str | None,
@ -1432,6 +1439,7 @@ class PlanExecEngine:
            verification_enabled=self._verification_enabled,
            verification_commands=self._verification_commands,
            phase_budgets=self._phase_budgets,
            max_reflections=self._max_reflections,
        )
        return PlanExecutor(
            agent_pool=step_executor,
@ -1590,11 +1598,13 @@ class ReActStepExecutor:
        model: str = "default",
        system_prompt: str | None = None,
        tools: list["Tool"] | None = None,
-        max_steps: int = 5,
+        max_steps: int = 10,
        confirmation_handler: Any | None = None,
        verification_enabled: bool = False,
        verification_commands: list[str] | None = None,
        phase_budgets: dict[str, int] | None = None,
        # KTD-2/R4: threaded through to each step's ReActEngine.
        max_reflections: int = 2,
    ):
        self._llm_gateway = llm_gateway
        self._messages = messages or []
@ -1607,6 +1617,8 @@ class ReActStepExecutor:
        self._verification_commands = verification_commands
        # U4/R11: thread through to each step's ReActEngine.
        self._phase_budgets = phase_budgets
        # KTD-2/R4: thread through to each step's ReActEngine.
        self._max_reflections = max_reflections
        self._agents: dict[str, _ReActStepAgent] = {}
    async def create_agent_from_skill(self, skill_name: str):
@ -1623,6 +1635,7 @@ class ReActStepExecutor:
            verification_enabled=self._verification_enabled,
            verification_commands=self._verification_commands,
            phase_budgets=self._phase_budgets,
            max_reflections=self._max_reflections,
        )
        self._agents[skill_name] = agent
        return agent
@ -1642,6 +1655,7 @@ class ReActStepExecutor:
            verification_enabled=self._verification_enabled,
            verification_commands=self._verification_commands,
            phase_budgets=self._phase_budgets,
            max_reflections=self._max_reflections,
        )
        self._agents[key] = agent
        return agent
@ -1662,11 +1676,12 @@ class _ReActStepAgent:
        model: str = "default",
        system_prompt: str | None = None,
        tools: list["Tool"] | None = None,
-        max_steps: int = 5,
+        max_steps: int = 10,
        confirmation_handler: Any | None = None,
        verification_enabled: bool = False,
        verification_commands: list[str] | None = None,
        phase_budgets: dict[str, int] | None = None,
        max_reflections: int = 2,
    ):
        self.name = name
        self._llm_gateway = llm_gateway
@ -1680,6 +1695,7 @@ class _ReActStepAgent:
        self._verification_commands = verification_commands
        # U4/R11: per-phase step quotas, passed to ReActEngine.
        self._phase_budgets = phase_budgets
        self._max_reflections = max_reflections
    async def execute(self, task_msg: TaskMessage) -> "TaskResult":
        """执行步骤：通过 ReActEngine 循环调用"""
@ -1710,6 +1726,7 @@ class _ReActStepAgent:
            verification_enabled=self._verification_enabled,
            verification_commands=self._verification_commands,
            phase_budgets=self._phase_budgets,
            max_reflections=self._max_reflections,
        )
        # 构建 messages
@ -1728,7 +1745,13 @@ class _ReActStepAgent:
        now = datetime.now(timezone.utc)
        status = TaskStatus.COMPLETED.value
-        if react_result.status in ("timeout", "cancelled"):
+        if react_result.status in (
            "timeout",
            "cancelled",
            "verify_failed",
            "gave_up_after_reflections",
            "failed",
        ):
            status = TaskStatus.FAILED.value
        return TaskResult(
--- a/src/agentkit/core/react.py
+++ b/src/agentkit/core/react.py
@ -33,10 +33,12 @@ from agentkit.telemetry.metrics import (
    agent_duration_histogram,
 )
 from agentkit.core.phase import PhaseState
 if TYPE_CHECKING:
    from agentkit.core.compressor import CompressionStrategy
    from agentkit.core.middleware import MiddlewareChain
-    from agentkit.core.phase import PhasePolicy, PhaseState
+    from agentkit.core.phase import PhasePolicy
    from agentkit.core.sandbox import WorkspaceSandbox
    from agentkit.core.trace import TraceRecorder
    from agentkit.evolution.pitfall_detector import PitfallWarning
@ -420,8 +422,6 @@ class ReActEngine:
        """
        if self._phase_policy is None or self._current_phase is None:
            return
        from agentkit.core.phase import PhaseState
        while self._current_phase not in (PhaseState.VERIFICATION, PhaseState.DELIVERY):
            nxt = self.advance_phase()
            if nxt is None:
@ -446,8 +446,6 @@ class ReActEngine:
        """
        if self._phase_policy is None or self._current_phase is None:
            return None
        from agentkit.core.phase import PhaseState
        nxt = PhaseState.next_of(self._current_phase)
        if nxt is None:
            # Already at DELIVERY — return None to signal no transition.
@ -890,8 +888,8 @@ class ReActEngine:
            trace_outcome = "success"
            # U4/G1: verify 失败回灌计数器。受 max_steps 上限约束(不无限循环)。
-            # U4/KTD-7: initialize from restored budget state (checkpoint resume).
+            # U4/KTD-7: _reflect_count is initialized from restored budget state
-            reinjections = self._reflect_count
+            # (checkpoint resume) and used directly — no redundant local copy.
            _loop_start = time.monotonic()
            while step < self._max_steps:
@ -913,9 +911,7 @@ class ReActEngine:
                    and self._phase_policy is not None
                    and self._current_phase is not None
                ):
-                    from agentkit.core.phase import PhaseState as _PS
+                    if self._current_phase in (PhaseState.PLANNING, PhaseState.BUILDING):
                    if self._current_phase in (_PS.PLANNING, _PS.BUILDING):
                        self._think_count += 1
                        think_quota = self._phase_budgets.get("think")
                        if think_quota is not None and self._think_count >= think_quota:
@ -1547,7 +1543,7 @@ class ReActEngine:
                                vresult = await vloop.verify()
                                if not vresult.passed:
                                    if (
-                                        reinjections < self._max_reinjections
+                                        self._reflect_count < self._max_reinjections
                                        and step < self._max_steps
                                    ):
                                        errors_text = "\n".join(vresult.errors)
@ -1557,7 +1553,6 @@ class ReActEngine:
                                                "content": (f"验证失败,错误如下:\n{errors_text}"),
                                            }
                                        )
                                        reinjections += 1
                                        # U4/R10: track reflect count for
                                        # checkpoint reconstruction (KTD-7).
                                        self._reflect_count += 1
@ -1574,7 +1569,7 @@ class ReActEngine:
                                            data={
                                                "message": (
                                                    f"验证失败,已注入错误信息让 LLM 自纠正 "
-                                                    f"(reinjection {reinjections}/{self._max_reinjections})"
+                                                    f"(reinjection {self._reflect_count}/{self._max_reinjections})"
                                                ),
                                                "verify_errors": vresult.errors,
                                            },
@ -1681,7 +1676,7 @@ class ReActEngine:
                                    logger.info(
                                        "Verification failed after %d reinjections, "
                                        "%d reflections, interrupting with verify log",
-                                        reinjections,
+                                        self._reflect_count,
                                        self._reflection_count,
                                    )
                                    break
@ -2136,7 +2131,7 @@ class ReActEngine:
        in_verification = (
            self._sandbox is not None
            and self._current_phase is not None
-            and self._current_phase.value == "verification"
+            and self._current_phase == PhaseState.VERIFICATION
        )
        try:
--- a/src/agentkit/core/sandbox.py
+++ b/src/agentkit/core/sandbox.py
@ -28,10 +28,24 @@ import contextlib
 import errno
 import logging
 import socket
 import threading
 from pathlib import Path
 logger = logging.getLogger(__name__)
 # Reentrancy counter for ``network_block``. Concurrent VERIFICATION phases
 # (parallel PLAN_EXEC steps) each enter the context manager; only the first
 # entry (0 -> 1) patches ``socket.socket.connect``, and only the last exit
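 # (1 -> 0) restores it. Naive per-call save/restore breaks in two ways: the
 # first exit unpatches while other phases are still expecting the block to
 # be in effect (breaking sandboxing for any phase that started later), and a
 # later entrant saves the already-patched ``connect`` as its "original", so
 # its exit re-installs the blocker and new connections stay blocked for good.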
 # ponytail: process-wide counter — not subprocess-safe (inherited fork state
 # is irrelevant because the monkey-patch lives in the parent's socket module).
 _network_block_count: int = 0
 _network_block_lock = threading.Lock()
 _original_socket_connect = socket.socket.connect
 _original_socket_connect_ex = socket.socket.connect_ex
 class SandboxNetworkBlockedError(RuntimeError):
    """Raised when a tool attempts an outbound network call under sandbox."""
@ -115,17 +129,23 @@ class WorkspaceSandbox:
        """Block outbound network connections within the async context.
        Patches ``socket.socket.connect`` and ``connect_ex`` to raise /
-        return ``ECONNREFUSED`` respectively. Restores the originals on exit,
+        return ``ECONNREFUSED`` respectively. Restores the originals on the
-        even if the wrapped code raises.
+        last concurrent exit, even if the wrapped code raises.
        Already-connected sockets (e.g. an LLM gateway keep-alive pool) are
        unaffected — only *new* ``connect()`` calls are blocked. This is the
        correct granularity: the LLM gateway talks over its existing
        connection, while a tool trying to ``requests.get(...)`` makes a new
        connect and is rejected.
        Reentrancy: a module-level counter guards the patch. Concurrent
        VERIFICATION phases (parallel PLAN_EXEC steps) each enter/exit; the
        patch is engaged on count 0->1 and released on count 1->0. Without
        this, the first exit would restore the original connect while later
        phases are still expecting the block, terminating new LLM gateway /
        Redis / PostgreSQL connections in those phases.
        """
-        original_connect = socket.socket.connect
+        global _network_block_count  # noqa: PLW0603
        original_connect_ex = socket.socket.connect_ex
        def _blocked_connect(self_sock, *args, **kwargs):  # noqa: ANN001
            raise SandboxNetworkBlockedError(
@ -136,15 +156,26 @@ class WorkspaceSandbox:
            # connect_ex returns an errno instead of raising (POSIX contract).
            return errno.ECONNREFUSED
        with _network_block_lock:
            _network_block_count += 1
            if _network_block_count == 1:
                socket.socket.connect = _blocked_connect  # type: ignore[method-assign]
                socket.socket.connect_ex = _blocked_connect_ex  # type: ignore[method-assign]
-        logger.debug("sandbox: network block engaged")
+                logger.debug("sandbox: network block engaged (count=1)")
        try:
            yield
        finally:
-            socket.socket.connect = original_connect  # type: ignore[method-assign]
+            with _network_block_lock:
-            socket.socket.connect_ex = original_connect_ex  # type: ignore[method-assign]
+                _network_block_count -= 1
-            logger.debug("sandbox: network block released")
+                if _network_block_count == 0:
                    socket.socket.connect = _original_socket_connect  # type: ignore[method-assign]
                    socket.socket.connect_ex = _original_socket_connect_ex  # type: ignore[method-assign]
                    logger.debug("sandbox: network block released (count=0)")
                else:
                    logger.debug(
                        "sandbox: network block still held (count=%d)",
                        _network_block_count,
                    )
 def detect_verification_commands(workspace_root: str | Path | None) -> list[str]:
--- a/src/agentkit/server/app.py
+++ b/src/agentkit/server/app.py
@ -805,7 +805,14 @@ async def lifespan(app: FastAPI):
    try:
        from agentkit.core.config_driven import drain_pending_evolution_tasks
-        await drain_pending_evolution_tasks()
+        await asyncio.wait_for(drain_pending_evolution_tasks(), timeout=10.0)
    except asyncio.TimeoutError:
        from agentkit.core.config_driven import _pending_evolution_tasks
        logger.warning(
            "drain_pending_evolution_tasks 超时 10s, %d 个任务被放弃",
            len(_pending_evolution_tasks),
        )
    except Exception:
        logger.debug("drain_pending_evolution_tasks 异常已忽略", exc_info=True)
--- a/src/agentkit/server/routes/chat.py
+++ b/src/agentkit/server/routes/chat.py
@ -1494,6 +1494,23 @@ async def _handle_chat_message(
                },
            }
        )
        # U8/R8: persist the spec_review_request so it survives a page reload.
        # The frontend reconstructs the pending review card from the restored
        # message metadata (spec_review_id + goal + steps).
        try:
            await sm.append_message(
                session_id=session_id,
                role=MessageRole.ASSISTANT,
                content=f"[Spec Review] {goal}",
                metadata={
                    "message_type": "spec_review_request",
                    "spec_review_id": spec_review_id,
                    "spec_review_goal": goal,
                    "spec_review_steps": steps,
                },
            )
        except Exception:
            logger.debug("Failed to persist spec_review_request", exc_info=True)
        loop = asyncio.get_running_loop()
        future: asyncio.Future[tuple[str, str]] = loop.create_future()
@ -1506,19 +1523,58 @@ async def _handle_chat_message(
            # "failed") so the user can resume on return.
            decision, feedback = await asyncio.wait_for(future, timeout=1800.0)
            logger.info(f"Spec review {spec_review_id} resolved: decision={decision!r}")
            # Persist the decision so the frontend can show the outcome after
            # a reload (e.g. timeout→parked transition the user never saw).
            try:
                await sm.append_message(
                    session_id=session_id,
                    role=MessageRole.ASSISTANT,
                    content=f"[Spec Review Decision] {decision}: {feedback}",
                    metadata={
                        "message_type": "spec_review_reply",
                        "spec_review_id": spec_review_id,
                        "spec_review_decision": decision,
                        "spec_review_feedback": feedback,
                    },
                )
            except Exception:
                logger.debug("Failed to persist spec_review_reply", exc_info=True)
            return decision, feedback
        except asyncio.TimeoutError:
            logger.warning(f"Spec review {spec_review_id} timed out (30 min)")
            # Persist the timeout→parked transition so the frontend can show
            # the parked state after a reload.
            try:
                await sm.append_message(
                    session_id=session_id,
                    role=MessageRole.ASSISTANT,
                    content=f"[Spec Review Timed Out] {spec_review_id}",
                    metadata={
                        "message_type": "spec_review_reply",
                        "spec_review_id": spec_review_id,
                        "spec_review_decision": "parked",
                        "spec_review_feedback": "timed out (30 min)",
                    },
                )
            except Exception:
                logger.debug("Failed to persist spec_review timeout", exc_info=True)
            raise
        finally:
            _pending_spec_reviews.pop(spec_review_id, None)
-    # Wire the handler onto a PlanExecEngine only (the WS PLAN_EXEC path uses
+    # U8/R8: spec review gate wiring. The WS PLAN_EXEC path uses
-    # a ReActEngine + phase_policy, where this is a no-op). Local import to
+    # ``_build_phase_engine`` which returns a ``ReActEngine`` with
-    # avoid a top-level dependency that the WS path doesn't need.
+    # ``phase_policy`` (NOT a ``PlanExecEngine``), so the gate cannot be
-    from agentkit.core.plan_exec_engine import PlanExecEngine as _PlanExecEngine
+    # wired here — ``ReActEngine`` does not read ``_spec_review_handler``.
-
+    # The gate only fires when ``ConfigDrivenAgent.execute_stream`` →
-    if isinstance(react_engine, _PlanExecEngine):
+    # ``_handle_plan_exec_stream`` → ``PlanExecEngine.execute_stream`` runs,
    # which is the portal/task path (not the WS chat path).
    # ponytail: known ceiling — WS chat PLAN_EXEC (phase_policy mechanism)
    # does not support spec review. Upgrade path: route WS PLAN_EXEC through
    # ``ConfigDrivenAgent.execute_stream`` to unify with the portal path and
    # inherit the gate. The ``_spec_review_handler`` closure + event handlers
    # below are kept so the upgrade is a routing change, not a rewrite.
    if hasattr(react_engine, "_spec_review_handler"):
        react_engine._spec_review_handler = _spec_review_handler
    logger.info(
--- a/src/agentkit/server/routes/portal.py
+++ b/src/agentkit/server/routes/portal.py
@ -23,7 +23,7 @@ from pydantic import BaseModel
 from agentkit.core.config_driven import ConfigDrivenAgent
 from agentkit.core.event_queue import EventQueue
-from agentkit.core.protocol import Event, TaskEventType, TaskStatus, TurnEventType
+from agentkit.core.protocol import Event, TaskEventType, TaskMessage, TaskStatus, TurnEventType
 from agentkit.core.react import ReActEngine
 from agentkit.chat.skill_routing import ExecutionMode, SkillRoutingResult
 from agentkit.chat.request_preprocessor import RequestPreprocessor
@ -73,6 +73,42 @@ def _ensure_non_empty(text: str | None) -> str:
    return EMPTY_LLM_RESPONSE
 def _build_portal_task(
    *,
    agent_name: str,
    messages: list[dict[str, str]],
    system_prompt: str | None,
    timeout_seconds: float | None,
    conversation_id: str | None = None,
    task_id: str | None = None,
 ) -> TaskMessage:
    """Construct a TaskMessage for routing through ConfigDrivenAgent.execute_stream.

    The portal builds messages externally (history + user message). The
    ``messages`` key in input_data tells _build_llm_messages to use them
    directly instead of rendering the prompt template. This lets the portal
    inherit evolution hooks + trace_outcome propagation from execute_stream's
    finally block (KTD-4/KTD-8).

    Args:
        agent_name: Stored as the task's ``agent_name``.
        messages: Pre-built ``{role, content}`` dicts, placed verbatim under
            ``input_data["messages"]``. The last entry's ``content`` is also
            copied to ``input_data["content"]``.
        system_prompt: Placed under ``input_data["system_prompt"]``; may be
            ``None``.
        timeout_seconds: Converted with ``int()``; a falsy value (``None`` or
            ``0``) falls back to 300.
        conversation_id: Forwarded unchanged to the task.
        task_id: Explicit task id; when falsy, a new ``uuid4`` string is
            generated.

    Returns:
        A ``TaskMessage`` with ``task_type="chat"``, ``priority=0``,
        ``callback_url=None`` and ``created_at`` set to the current UTC time.
    """
    # Function-scope import; datetime/timezone are only needed for created_at.
    from datetime import datetime, timezone
    return TaskMessage(
        task_id=task_id or str(uuid.uuid4()),
        agent_name=agent_name,
        task_type="chat",
        priority=0,
        input_data={
            "messages": messages,
            "system_prompt": system_prompt,
            # Content of the last message, or "" when the list is empty or
            # the entry has no "content" key. Presumably the current user
            # message (portal callers append it last) — confirm for new callers.
            "content": messages[-1].get("content", "") if messages else "",
        },
        callback_url=None,
        created_at=datetime.now(timezone.utc),
        # NOTE(review): int() truncates, so a sub-second timeout such as 0.5
        # is truthy and becomes 0 rather than falling back to 300.
        timeout_seconds=int(timeout_seconds) if timeout_seconds else 300,
        conversation_id=conversation_id,
    )
 async def _emit_event_safe(
    event_queue: EventQueue | None,
    event_type: str,
@ -556,38 +592,39 @@ async def chat(request: ChatRequest, req: Request, _auth: None = Depends(_verify
            )
        react_config = agent.get_react_config()
-        react_engine = getattr(agent, "_react_engine", None)
+        # KTD-4/KTD-8: route through ConfigDrivenAgent.execute_stream so the
-        if react_engine is None:
+        # finally block fires evolution hooks + propagates trace_outcome. The
-            react_engine = ReActEngine(
+        # portal builds messages externally; _build_portal_task packages them
        # into a TaskMessage whose input_data["messages"] is used directly by
        # _build_llm_messages (bypassing the prompt template).
        _react_engine = getattr(agent, "_react_engine", None)
        if _react_engine is None:
            _react_engine = ReActEngine(
                llm_gateway=llm_gateway,
                max_steps=react_config["max_steps"],
            )
            agent._react_engine = _react_engine
        else:
-            react_engine.reset()
+            _react_engine.reset()
        messages = [{"role": "user", "content": request.message}]
        # Inject conversation history
        history_msgs = await _build_history_messages(conv.id)
        for hm in reversed(history_msgs):
            messages.insert(0, hm)
        tools = agent.get_tools()
        model = agent.get_model()
        system_prompt = getattr(agent, "_system_prompt", None) or agent.get_system_prompt()
        timeout_seconds = react_config["timeout_seconds"]
-        collected_output: list[str] = []
+        portal_task = _build_portal_task(
        try:
            # U2 verify: calls react_engine.execute_stream directly, bypassing
            # ConfigDrivenAgent.execute_stream — evolution hooks NOT propagated
            # here. Routing through agent.execute_stream is tracked separately.
            async for event in react_engine.execute_stream(
                messages=messages,
                tools=tools,
                model=model,
            agent_name=agent.name,
            messages=messages,
            system_prompt=system_prompt,
            timeout_seconds=timeout_seconds,
-            ):
+            conversation_id=conv.id,
        )
        collected_output: list[str] = []
        try:
            async for event in agent.execute_stream(portal_task):
                if event.event_type == "final_answer":
                    collected_output.append(event.data.get("output", ""))
        except asyncio.CancelledError:
@ -684,34 +721,32 @@ async def chat_stream(request: ChatRequest, req: Request, _auth: None = Depends(
                )
            react_config = agent.get_react_config()
-            react_engine = getattr(agent, "_react_engine", None)
+            # KTD-4/KTD-8: route through ConfigDrivenAgent.execute_stream
-            if react_engine is None:
+            # (evolution hooks + trace_outcome propagation in finally block).
-                react_engine = ReActEngine(
+            _react_engine = getattr(agent, "_react_engine", None)
            if _react_engine is None:
                _react_engine = ReActEngine(
                    llm_gateway=llm_gateway,
                    max_steps=react_config["max_steps"],
                )
                agent._react_engine = _react_engine
            else:
-                react_engine.reset()
+                _react_engine.reset()
            messages = [{"role": "user", "content": request.message}]
            tools = agent.get_tools()
            model = agent.get_model()
            system_prompt = getattr(agent, "_system_prompt", None) or agent.get_system_prompt()
            timeout_seconds = react_config["timeout_seconds"]
-            collected_output: list[str] = []
+            portal_task = _build_portal_task(
            try:
                # U2 verify: calls react_engine.execute_stream directly, bypassing
                # ConfigDrivenAgent.execute_stream — evolution hooks NOT propagated
                # here. Routing through agent.execute_stream is tracked separately.
                async for event in react_engine.execute_stream(
                    messages=messages,
                    tools=tools,
                    model=model,
                agent_name=agent.name,
                messages=messages,
                system_prompt=system_prompt,
                timeout_seconds=timeout_seconds,
-                ):
+                conversation_id=conv.id,
            )
            collected_output: list[str] = []
            try:
                async for event in agent.execute_stream(portal_task):
                    if event.event_type == "final_answer":
                        collected_output.append(event.data.get("output", ""))
                    yield {
@ -967,11 +1002,8 @@ def _derive_title_from_messages(messages: list) -> str:
 async def _execute_react_background(
-    react_engine: ReActEngine,
+    agent: ConfigDrivenAgent,
    messages: list[dict],
    tools: list,
    model: str,
    agent_name: str,
    system_prompt: str | None,
    timeout_seconds: float | None,
    conv_id: str,
@ -987,6 +1019,10 @@ async def _execute_react_background(
    Results are always persisted to the conversation store, regardless of
    whether a WebSocket subscriber is active.
    Task status is tracked in TaskStore when provided.
    KTD-4/KTD-8: routes through ``agent.execute_stream`` (not
    ``react_engine.execute_stream`` directly) so the finally block fires
    evolution hooks and propagates trace_outcome.
    """
    collected_output: list[str] = []
    try:
@ -1005,17 +1041,15 @@ async def _execute_react_background(
            ):
                logger.warning("Failed to update TaskStore RUNNING", exc_info=True)
-        # U2 verify: calls react_engine.execute_stream directly, bypassing
+        portal_task = _build_portal_task(
-        # ConfigDrivenAgent.execute_stream — evolution hooks NOT propagated
+            agent_name=agent.name,
        # here. Routing through agent.execute_stream is tracked separately.
        async for event in react_engine.execute_stream(
            messages=messages,
            tools=tools,
            model=model,
            agent_name=agent_name,
            system_prompt=system_prompt,
            timeout_seconds=timeout_seconds,
-        ):
+            conversation_id=conv_id,
            task_id=task_id,
        )
        async for event in agent.execute_stream(portal_task):
            if event.event_type == "final_answer":
                collected_output.append(event.data.get("output", ""))
@ -1219,6 +1253,14 @@ async def portal_websocket(websocket: WebSocket):
    task_id: str | None = None
    # Track the active background task so cancel can propagate to it.
    active_bg_task: asyncio.Task | None = None
    # U8/R8: pending spec review futures. The portal WS path doesn't wire
    # _spec_review_handler on the agent (the background task architecture
    # makes EventQueue-based request/reply non-trivial), so this dict is
    # typically empty. It exists so stale spec_review_reply messages from
    # the frontend are handled gracefully instead of silently ignored.
    # ponytail: upgrade path — wire _spec_review_handler via EventQueue +
    # future, mirroring chat.py's _spec_review_handler closure.
    pending_spec_reviews: dict[str, asyncio.Future[tuple[str, str]]] = {}
    try:
        while True:
@ -1256,6 +1298,32 @@ async def portal_websocket(websocket: WebSocket):
                await websocket.send_json({"type": "pong"})
                continue
            if msg_type == "spec_review_reply":
                # U8/R8: mirror chat.py:1126 — resolve a pending spec review
                # future. Typically a no-op in the portal WS path (the
                # _spec_review_handler isn't wired), but handles stale replies
                # gracefully.
                spec_review_id = msg.get("spec_review_id")
                decision = msg.get("decision", "rejected")
                feedback = msg.get("feedback", "")
                logger.info(
                    f"Received spec_review_reply: id={spec_review_id!r}, decision={decision!r}"
                )
                if spec_review_id and spec_review_id in pending_spec_reviews:
                    fut = pending_spec_reviews[spec_review_id]
                    if not fut.done():
                        fut.set_result((decision, feedback))
                    else:
                        logger.warning(
                            f"spec_review_reply {spec_review_id!r} already resolved"
                        )
                else:
                    logger.warning(
                        f"spec_review_reply {spec_review_id!r} not found in "
                        f"pending_spec_reviews — ignoring"
                    )
                continue
            if msg_type == "resume":
                # Frontend reconnected and wants to resume a running task
                resume_task_id = msg.get("task_id", "")
@ -1800,15 +1868,17 @@ async def portal_websocket(websocket: WebSocket):
            # Execute via ReAct stream
            react_config = agent.get_react_config()
-            # Reuse agent's ReActEngine if available (aligned with chat.py pattern)
+            # KTD-4/KTD-8: route through ConfigDrivenAgent.execute_stream
-            react_engine = getattr(agent, "_react_engine", None)
+            # (evolution hooks + trace_outcome propagation in finally block).
-            if react_engine is None:
+            _react_engine = getattr(agent, "_react_engine", None)
-                react_engine = ReActEngine(
+            if _react_engine is None:
                _react_engine = ReActEngine(
                    llm_gateway=llm_gateway,
                    max_steps=react_config["max_steps"],
                )
                agent._react_engine = _react_engine
            else:
-                react_engine.reset()
+                _react_engine.reset()
            messages = [{"role": "user", "content": message_text}]
            # Inject conversation history for context continuity
@ -1829,11 +1899,8 @@ async def portal_websocket(websocket: WebSocket):
            # background task continues running and persists the result.
            bg_task = asyncio.create_task(
                _execute_react_background(
-                    react_engine=react_engine,
+                    agent=agent,
                    messages=messages,
                    tools=tools,
                    model=model,
                    agent_name=agent.name,
                    system_prompt=system_prompt,
                    timeout_seconds=timeout_seconds,
                    conv_id=conv.id,
--- a/tests/unit/server/test_portal_ws_background.py
+++ b/tests/unit/server/test_portal_ws_background.py
@ -38,10 +38,12 @@ class FakeConversationStore:
 class FakeReactEngine:
    """Fake ReAct engine that yields events from a predefined list."""
    name = "test-agent"
    def __init__(self, events: list[Event]) -> None:
        self._events = events
-    async def execute_stream(self, **kwargs):
+    async def execute_stream(self, task):
        for event in self._events:
            yield event
@ -49,11 +51,13 @@ class FakeReactEngine:
 class FailingReactEngine:
    """Fake ReAct engine that raises an exception after yielding some events."""
    name = "test-agent"
    def __init__(self, events: list[Event], error: Exception) -> None:
        self._events = events
        self._error = error
-    async def execute_stream(self, **kwargs):
+    async def execute_stream(self, task):
        for event in self._events:
            yield event
        raise self._error
@ -76,11 +80,13 @@ def _make_event(
 class SlowFakeReactEngine:
    """Fake ReAct engine with a delay to allow status checks during execution."""
    name = "test-agent"
    def __init__(self, events: list[Event], delay: float = 0.1) -> None:
        self._events = events
        self._delay = delay
-    async def execute_stream(self, **kwargs):
+    async def execute_stream(self, task):
        for event in self._events:
            await asyncio.sleep(self._delay)
            yield event
@ -93,11 +99,13 @@ class CancellableReactEngine:
    Event so the test can cancel the task and verify CancelledError cleanup.
    """
    name = "test-agent"
    def __init__(self, first_event: Event) -> None:
        self._first_event = first_event
        self.started = asyncio.Event()
-    async def execute_stream(self, **kwargs):
+    async def execute_stream(self, task):
        yield self._first_event
        self.started.set()
        # Block forever until cancelled
@ -130,11 +138,8 @@ class TestExecuteReactBackground:
        eq = EventQueue()
        await _execute_react_background(
-            react_engine=engine,
+            agent=engine,
            messages=[],
            tools=[],
            model="test-model",
            agent_name="test-agent",
            system_prompt=None,
            timeout_seconds=None,
            conv_id="test-conv",
@ -162,11 +167,8 @@ class TestExecuteReactBackground:
        eq = EventQueue()
        await _execute_react_background(
-            react_engine=engine,
+            agent=engine,
            messages=[],
            tools=[],
            model="test-model",
            agent_name="test-agent",
            system_prompt=None,
            timeout_seconds=None,
            conv_id="test-conv",
@ -190,11 +192,8 @@ class TestExecuteReactBackground:
        eq = EventQueue()
        await _execute_react_background(
-            react_engine=engine,
+            agent=engine,
            messages=[],
            tools=[],
            model="test-model",
            agent_name="test-agent",
            system_prompt=None,
            timeout_seconds=None,
            conv_id="test-conv",
@ -228,11 +227,8 @@ class TestExecuteReactBackground:
        await asyncio.sleep(0.05)
        await _execute_react_background(
-            react_engine=engine,
+            agent=engine,
            messages=[],
            tools=[],
            model="test-model",
            agent_name="test-agent",
            system_prompt=None,
            timeout_seconds=None,
            conv_id="test-conv",
@ -270,11 +266,8 @@ class TestExecuteReactBackground:
        await asyncio.sleep(0.05)
        await _execute_react_background(
-            react_engine=engine,
+            agent=engine,
            messages=[],
            tools=[],
            model="test-model",
            agent_name="test-agent",
            system_prompt=None,
            timeout_seconds=None,
            conv_id="test-conv",
@ -318,11 +311,8 @@ class TestTaskStoreIntegration:
        # Start background task
        bg_task = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
                tools=[],
                model="test-model",
                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="test-conv",
@ -365,11 +355,8 @@ class TestTaskStoreIntegration:
        )
        await _execute_react_background(
-            react_engine=engine,
+            agent=engine,
            messages=[],
            tools=[],
            model="test-model",
            agent_name="test-agent",
            system_prompt=None,
            timeout_seconds=None,
            conv_id="test-conv",
@ -394,11 +381,8 @@ class TestTaskStoreIntegration:
        # Should not raise
        await _execute_react_background(
-            react_engine=engine,
+            agent=engine,
            messages=[],
            tools=[],
            model="test-model",
            agent_name="test-agent",
            system_prompt=None,
            timeout_seconds=None,
            conv_id="test-conv",
@ -552,11 +536,8 @@ class TestCancelledErrorPath:
        bg_task = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
                tools=[],
                model="test-model",
                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="test-conv",
@ -590,11 +571,8 @@ class TestCancelledErrorPath:
        bg_task = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
                tools=[],
                model="test-model",
                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="test-conv",
@ -636,11 +614,8 @@ class TestCancelledErrorPath:
        bg_task = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
                tools=[],
                model="test-model",
                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="test-conv",
@ -769,11 +744,8 @@ class TestCancelPropagation:
        # Simulate the background task as portal.py would create it
        active_bg_task: asyncio.Task | None = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
                tools=[],
                model="test-model",
                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="cancel-conv",
@ -814,11 +786,8 @@ class TestCancelPropagation:
        bg_task = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
                tools=[],
                model="test-model",
                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="test-conv",
@ -865,11 +834,8 @@ class TestWebSocketDisconnectNoCancel:
        # Start the background task (as portal.py would)
        bg_task = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
                tools=[],
                model="test-model",
                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="test-conv",
@ -912,11 +878,8 @@ class TestWebSocketDisconnectNoCancel:
        bg_task = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
                tools=[],
                model="test-model",
                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="resume-conv",
--- a/tests/unit/test_execute_stream_hooks.py
+++ b/tests/unit/test_execute_stream_hooks.py
@ -112,7 +112,8 @@ class TestExecuteStreamHooks:
        assert events[0].event_type == "final_answer"
        assert len(fired) == 1
        assert fired[0].status == TaskStatus.COMPLETED
-        assert fired[0].output_data == {"content": "hello world"}
+        # KTD-8: output_data includes trace_outcome for lifecycle._is_failure_path()
        assert fired[0].output_data == {"content": "hello world", "trace_outcome": "success"}
    async def test_failure_fires_on_task_failed(self):
        """Stream exception fires evolve_after_task with FAILED status."""