fix(review): resolve 11 P1 blockers from ce-code-review

P1#1 config_driven: propagate trace_outcome into output_data so lifecycle._is_failure_path() detects non-success outcomes
P1#2 portal: route through ConfigDrivenAgent.execute_stream (not react_engine.execute_stream directly) so evolution hooks fire and trace_outcome propagates; add pre-built messages support in _build_llm_messages
P1#3 sandbox: make network_block reentrant via module-level reference counter + threading.Lock - concurrent VERIFICATION phases no longer permanently block all new connections
P1#4 chat: replace dead isinstance(_PlanExecEngine) check with hasattr(_spec_review_handler) to wire the spec review gate
P1#5 plan_exec_engine: complete max_reflections threading chain (PlanExecEngine + ReActStepExecutor constructors)
P1#6 plan_exec_engine: enforce phase budgets (max_steps from phase_budgets, not hardcoded 5)
P1#7 plan_exec_engine: use current plan (not stale plan var) in aggregation after replan
P1#8 plan_exec_engine: map failure to failed status (not success)
P1#9 app: add drain timeout for pending evolution tasks on shutdown
P1#10 portal: handle spec_review_reply in WS handler
P1#11 chat: persist spec_review_request/reply/timeout to conversation store so reload can reconstruct gate state

Tests: 116 related tests pass; 26 pre-existing failures unchanged (stash-verified). ruff lint clean.
2026-07-04 01:10:01 +08:00 · 2026-07-04 01:10:01 +08:00 · e5e76697a9
parent 7c900ce280
commit e5e76697a9
9 changed files with 355 additions and 163 deletions
--- a/src/agentkit/core/config_driven.py
+++ b/src/agentkit/core/config_driven.py
@ -72,6 +72,11 @@ async def drain_pending_evolution_tasks() -> None:
    await asyncio.gather(*_pending_evolution_tasks, return_exceptions=True)


+def get_evolution_dropped_count() -> int:
+    """Return the number of evolution tasks dropped due to backpressure."""
+    return _evolution_dropped_count
+
+
 class AgentConfig:
    """Agent 配置模型，从 YAML 或 Dict 构建"""

@ -739,7 +744,20 @@ class ConfigDrivenAgent(BaseAgent, EvolutionMixin):

        Shared by all _handle_*_stream methods to avoid duplicating the
        message-rendering logic that mirrors the sync _handle_* methods.
+
+        Portal path: if ``task.input_data["messages"]`` is present (a list of
+        ``{role, content}`` dicts), use those pre-built messages directly
+        instead of rendering the prompt template. This lets the portal route
+        through ``execute_stream`` (inheriting evolution hooks + trace_outcome
+        propagation) while keeping its external message-building logic.
        """
+        prebuilt = task.input_data.get("messages")
+        if prebuilt is not None:
+            system_prompt = task.input_data.get("system_prompt")
+            user_messages = [m for m in prebuilt if m.get("role") != "system"]
+            if not user_messages:
+                user_messages = [{"role": "user", "content": str(task.input_data)}]
+            return system_prompt, user_messages
        variables = task.input_data.copy()
        variables["task_type"] = task.task_type
        if self._prompt_template:
@ -774,22 +792,35 @@ class ConfigDrivenAgent(BaseAgent, EvolutionMixin):
        token = CancellationToken()
        self._active_tokens[task.task_id] = token
        _stream_output: dict = {}
+        _stream_trace_outcome: str = "success"
        _stream_error: BaseException | None = None
        _stream_completed = False
+        _stream_started_at = datetime.now(timezone.utc)
        try:
            await self._register_mcp_tools()
            async for event in self.handle_task_stream(task):
                if event.event_type == "final_answer":
                    _raw = event.data.get("output", "")
                    _stream_output = {"content": _raw} if isinstance(_raw, str) else _raw
+                    # PLAN_EXEC path may embed trace_outcome in final_answer.
+                    _to = event.data.get("trace_outcome")
+                    if _to:
+                        _stream_trace_outcome = _to
+                elif event.event_type == "final_result":
+                    # REACT path: final_result carries ReActResult.status.
+                    _result = event.data.get("result")
+                    if _result is not None:
+                        _stream_trace_outcome = getattr(_result, "status", "success")
                yield event
            _stream_completed = True
        except asyncio.CancelledError as ce:
            # Cancellation must propagate, but hooks still fire (U2 edge case).
            _stream_error = ce
+            _stream_trace_outcome = "cancelled"
            raise
        except Exception as e:
            _stream_error = e
+            _stream_trace_outcome = "error"
            raise
        finally:
            # async generator 的 finally 在 generator 关闭时执行（GC/aclose/正常结束）
@ -797,6 +828,12 @@ class ConfigDrivenAgent(BaseAgent, EvolutionMixin):
            # KTD-4: lifecycle parity — fire evolution hooks fire-and-forget.
            try:
                now = datetime.now(timezone.utc)
+                # KTD-8: propagate trace_outcome into output_data so
+                # lifecycle._is_failure_path() can detect non-success outcomes.
+                if _stream_output:
+                    _stream_output["trace_outcome"] = _stream_trace_outcome
+                else:
+                    _stream_output = {"trace_outcome": _stream_trace_outcome}
                if _stream_error is not None:
                    if isinstance(_stream_error, (asyncio.CancelledError, TaskCancelledError)):
                        status = TaskStatus.CANCELLED
@ -810,17 +847,29 @@ class ConfigDrivenAgent(BaseAgent, EvolutionMixin):
                        status=status,
                        output_data=None,
                        error_message=err_msg,
-                        started_at=now,
+                        started_at=_stream_started_at,
                        completed_at=now,
                    )
                elif _stream_completed:
+                    # KTD-8: map non-success trace_outcomes to FAILED.
+                    if _stream_trace_outcome in (
+                        "gave_up_after_reflections",
+                        "verify_failed",
+                        "verify_quota_exhausted",
+                        "failed",
+                    ):
+                        status = TaskStatus.FAILED
+                        err_msg = _stream_trace_outcome
+                    else:
+                        status = TaskStatus.COMPLETED
+                        err_msg = None
                    result = TaskResult(
                        task_id=task.task_id,
                        agent_name=self.name,
-                        status=TaskStatus.COMPLETED,
+                        status=status,
                        output_data=_stream_output,
-                        error_message=None,
-                        started_at=now,
+                        error_message=err_msg,
+                        started_at=_stream_started_at,
                        completed_at=now,
                    )
                else:
@ -831,7 +880,7 @@ class ConfigDrivenAgent(BaseAgent, EvolutionMixin):
                        status=TaskStatus.CANCELLED,
                        output_data=None,
                        error_message="stream closed before completion",
-                        started_at=now,
+                        started_at=_stream_started_at,
                        completed_at=now,
                    )
                self._trigger_evolution_hooks(task, result)
--- a/src/agentkit/core/plan_exec_engine.py
+++ b/src/agentkit/core/plan_exec_engine.py
@ -121,6 +121,10 @@ class PlanExecEngine:
        # user's decision. None = skip the gate (backward compat — the engine
        # proceeds directly to execution after Spec persistence).
        spec_review_handler: SpecReviewHandler | None = None,
+        # KTD-2/R4: max reflections for ReActEngine reinjection→reflection
+        # escalation. Threaded through to each step's ReActEngine so the
+        # verify-failed path can escalate from reinjection to full reflection.
+        max_reflections: int = 2,
    ):
        """
        Args:
@ -159,6 +163,8 @@ class PlanExecEngine:
        self._pitfall_detector = pitfall_detector
        # U8/R8: spec review gate handler. None = skip gate (backward compat).
        self._spec_review_handler = spec_review_handler
+        # KTD-2/R4: max reflections threaded to each step's ReActEngine.
+        self._max_reflections = max_reflections
        # U4/R11: copy the default to avoid mutating the module-level dict.
        self._phase_budgets = (
            dict(phase_budgets) if phase_budgets is not None else dict(_DEFAULT_PHASE_BUDGETS)
@ -605,9 +611,10 @@ class PlanExecEngine:
                    "output": output,
                    "total_steps": len(state.trajectory),
                    "total_tokens": state.total_tokens,
-                    "plan_id": plan.plan_id,
+                    "plan_id": current_plan.plan_id,
                    "plan_status": plan_result.status.value,
                    "replanned": state.replanned,
+                    "trace_outcome": trace_outcome,
                },
            )

@ -637,7 +644,7 @@ class PlanExecEngine:
    async def _inject_pitfall_warnings(
        self,
        goal: str,
-        plan_steps: list[Any],
+        plan_steps: list[PlanStep],
        task_type: str,
        actor: str,
        system_prompt: str | None,
@ -1432,6 +1439,7 @@ class PlanExecEngine:
            verification_enabled=self._verification_enabled,
            verification_commands=self._verification_commands,
            phase_budgets=self._phase_budgets,
+            max_reflections=self._max_reflections,
        )
        return PlanExecutor(
            agent_pool=step_executor,
@ -1590,11 +1598,13 @@ class ReActStepExecutor:
        model: str = "default",
        system_prompt: str | None = None,
        tools: list["Tool"] | None = None,
-        max_steps: int = 5,
+        max_steps: int = 10,
        confirmation_handler: Any | None = None,
        verification_enabled: bool = False,
        verification_commands: list[str] | None = None,
        phase_budgets: dict[str, int] | None = None,
+        # KTD-2/R4: threaded through to each step's ReActEngine.
+        max_reflections: int = 2,
    ):
        self._llm_gateway = llm_gateway
        self._messages = messages or []
@ -1607,6 +1617,8 @@ class ReActStepExecutor:
        self._verification_commands = verification_commands
        # U4/R11: thread through to each step's ReActEngine.
        self._phase_budgets = phase_budgets
+        # KTD-2/R4: thread through to each step's ReActEngine.
+        self._max_reflections = max_reflections
        self._agents: dict[str, _ReActStepAgent] = {}

    async def create_agent_from_skill(self, skill_name: str):
@ -1623,6 +1635,7 @@ class ReActStepExecutor:
            verification_enabled=self._verification_enabled,
            verification_commands=self._verification_commands,
            phase_budgets=self._phase_budgets,
+            max_reflections=self._max_reflections,
        )
        self._agents[skill_name] = agent
        return agent
@ -1642,6 +1655,7 @@ class ReActStepExecutor:
            verification_enabled=self._verification_enabled,
            verification_commands=self._verification_commands,
            phase_budgets=self._phase_budgets,
+            max_reflections=self._max_reflections,
        )
        self._agents[key] = agent
        return agent
@ -1662,11 +1676,12 @@ class _ReActStepAgent:
        model: str = "default",
        system_prompt: str | None = None,
        tools: list["Tool"] | None = None,
-        max_steps: int = 5,
+        max_steps: int = 10,
        confirmation_handler: Any | None = None,
        verification_enabled: bool = False,
        verification_commands: list[str] | None = None,
        phase_budgets: dict[str, int] | None = None,
+        max_reflections: int = 2,
    ):
        self.name = name
        self._llm_gateway = llm_gateway
@ -1680,6 +1695,7 @@ class _ReActStepAgent:
        self._verification_commands = verification_commands
        # U4/R11: per-phase step quotas, passed to ReActEngine.
        self._phase_budgets = phase_budgets
+        self._max_reflections = max_reflections

    async def execute(self, task_msg: TaskMessage) -> "TaskResult":
        """执行步骤：通过 ReActEngine 循环调用"""
@ -1710,6 +1726,7 @@ class _ReActStepAgent:
            verification_enabled=self._verification_enabled,
            verification_commands=self._verification_commands,
            phase_budgets=self._phase_budgets,
+            max_reflections=self._max_reflections,
        )

        # 构建 messages
@ -1728,7 +1745,13 @@ class _ReActStepAgent:

        now = datetime.now(timezone.utc)
        status = TaskStatus.COMPLETED.value
-        if react_result.status in ("timeout", "cancelled"):
+        if react_result.status in (
+            "timeout",
+            "cancelled",
+            "verify_failed",
+            "gave_up_after_reflections",
+            "failed",
+        ):
            status = TaskStatus.FAILED.value

        return TaskResult(
--- a/src/agentkit/core/react.py
+++ b/src/agentkit/core/react.py
@ -33,10 +33,12 @@ from agentkit.telemetry.metrics import (
    agent_duration_histogram,
 )

+from agentkit.core.phase import PhaseState
+
 if TYPE_CHECKING:
    from agentkit.core.compressor import CompressionStrategy
    from agentkit.core.middleware import MiddlewareChain
-    from agentkit.core.phase import PhasePolicy, PhaseState
+    from agentkit.core.phase import PhasePolicy
    from agentkit.core.sandbox import WorkspaceSandbox
    from agentkit.core.trace import TraceRecorder
    from agentkit.evolution.pitfall_detector import PitfallWarning
@ -420,8 +422,6 @@ class ReActEngine:
        """
        if self._phase_policy is None or self._current_phase is None:
            return
-        from agentkit.core.phase import PhaseState
-
        while self._current_phase not in (PhaseState.VERIFICATION, PhaseState.DELIVERY):
            nxt = self.advance_phase()
            if nxt is None:
@ -446,8 +446,6 @@ class ReActEngine:
        """
        if self._phase_policy is None or self._current_phase is None:
            return None
-        from agentkit.core.phase import PhaseState
-
        nxt = PhaseState.next_of(self._current_phase)
        if nxt is None:
            # Already at DELIVERY — return None to signal no transition.
@ -890,8 +888,8 @@ class ReActEngine:

            trace_outcome = "success"
            # U4/G1: verify 失败回灌计数器。受 max_steps 上限约束(不无限循环)。
-            # U4/KTD-7: initialize from restored budget state (checkpoint resume).
-            reinjections = self._reflect_count
+            # U4/KTD-7: _reflect_count is initialized from restored budget state
+            # (checkpoint resume) and used directly — no redundant local copy.
            _loop_start = time.monotonic()

            while step < self._max_steps:
@ -913,9 +911,7 @@ class ReActEngine:
                    and self._phase_policy is not None
                    and self._current_phase is not None
                ):
-                    from agentkit.core.phase import PhaseState as _PS
-
-                    if self._current_phase in (_PS.PLANNING, _PS.BUILDING):
+                    if self._current_phase in (PhaseState.PLANNING, PhaseState.BUILDING):
                        self._think_count += 1
                        think_quota = self._phase_budgets.get("think")
                        if think_quota is not None and self._think_count >= think_quota:
@ -1547,7 +1543,7 @@ class ReActEngine:
                                vresult = await vloop.verify()
                                if not vresult.passed:
                                    if (
-                                        reinjections < self._max_reinjections
+                                        self._reflect_count < self._max_reinjections
                                        and step < self._max_steps
                                    ):
                                        errors_text = "\n".join(vresult.errors)
@ -1557,7 +1553,6 @@ class ReActEngine:
                                                "content": (f"验证失败,错误如下:\n{errors_text}"),
                                            }
                                        )
-                                        reinjections += 1
                                        # U4/R10: track reflect count for
                                        # checkpoint reconstruction (KTD-7).
                                        self._reflect_count += 1
@ -1574,7 +1569,7 @@ class ReActEngine:
                                            data={
                                                "message": (
                                                    f"验证失败,已注入错误信息让 LLM 自纠正 "
-                                                    f"(reinjection {reinjections}/{self._max_reinjections})"
+                                                    f"(reinjection {self._reflect_count}/{self._max_reinjections})"
                                                ),
                                                "verify_errors": vresult.errors,
                                            },
@ -1681,7 +1676,7 @@ class ReActEngine:
                                    logger.info(
                                        "Verification failed after %d reinjections, "
                                        "%d reflections, interrupting with verify log",
-                                        reinjections,
+                                        self._reflect_count,
                                        self._reflection_count,
                                    )
                                    break
@ -2136,7 +2131,7 @@ class ReActEngine:
        in_verification = (
            self._sandbox is not None
            and self._current_phase is not None
-            and self._current_phase.value == "verification"
+            and self._current_phase == PhaseState.VERIFICATION
        )

        try:
--- a/src/agentkit/core/sandbox.py
+++ b/src/agentkit/core/sandbox.py
@ -28,10 +28,24 @@ import contextlib
 import errno
 import logging
 import socket
+import threading
 from pathlib import Path

 logger = logging.getLogger(__name__)

+# Reentrancy counter for ``network_block``. Concurrent VERIFICATION phases
+# (parallel PLAN_EXEC steps) each enter the context manager; only the first
+# entry (0 -> 1) patches ``socket.socket.connect``, and only the last exit
+# (1 -> 0) restores it. Naive save/restore would unpatch on the first exit
+# while other phases are still expecting the block to be in effect, breaking
+# sandboxing for any phase that started later.
+# ponytail: process-wide counter — not subprocess-safe (inherited fork state
+# is irrelevant because the monkey-patch lives in the parent's socket module).
+_network_block_count: int = 0
+_network_block_lock = threading.Lock()
+_original_socket_connect = socket.socket.connect
+_original_socket_connect_ex = socket.socket.connect_ex
+

 class SandboxNetworkBlockedError(RuntimeError):
    """Raised when a tool attempts an outbound network call under sandbox."""
@ -115,17 +129,23 @@ class WorkspaceSandbox:
        """Block outbound network connections within the async context.

        Patches ``socket.socket.connect`` and ``connect_ex`` to raise /
-        return ``ECONNREFUSED`` respectively. Restores the originals on exit,
-        even if the wrapped code raises.
+        return ``ECONNREFUSED`` respectively. Restores the originals on the
+        last concurrent exit, even if the wrapped code raises.

        Already-connected sockets (e.g. an LLM gateway keep-alive pool) are
        unaffected — only *new* ``connect()`` calls are blocked. This is the
        correct granularity: the LLM gateway talks over its existing
        connection, while a tool trying to ``requests.get(...)`` makes a new
        connect and is rejected.
+
+        Reentrancy: a module-level counter guards the patch. Concurrent
+        VERIFICATION phases (parallel PLAN_EXEC steps) each enter/exit; the
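+        patch is engaged on count 0->1 and released on count 1->0. Without
+        this, overlapping save/restore misbehaves in two ways: the first exit
+        restores the original connect while later phases still expect the
+        block, and if a phase that entered while the patch was active exits
+        last, it restores the already-patched connect it saved on entry, so
+        new LLM gateway / Redis / PostgreSQL connections stay blocked after
+        every phase has exited.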
        """
-        original_connect = socket.socket.connect
-        original_connect_ex = socket.socket.connect_ex
+        global _network_block_count  # noqa: PLW0603

        def _blocked_connect(self_sock, *args, **kwargs):  # noqa: ANN001
            raise SandboxNetworkBlockedError(
@ -136,15 +156,26 @@ class WorkspaceSandbox:
            # connect_ex returns an errno instead of raising (POSIX contract).
            return errno.ECONNREFUSED

+        with _network_block_lock:
+            _network_block_count += 1
+            if _network_block_count == 1:
                socket.socket.connect = _blocked_connect  # type: ignore[method-assign]
                socket.socket.connect_ex = _blocked_connect_ex  # type: ignore[method-assign]
-        logger.debug("sandbox: network block engaged")
+                logger.debug("sandbox: network block engaged (count=1)")
        try:
            yield
        finally:
-            socket.socket.connect = original_connect  # type: ignore[method-assign]
-            socket.socket.connect_ex = original_connect_ex  # type: ignore[method-assign]
-            logger.debug("sandbox: network block released")
+            with _network_block_lock:
+                _network_block_count -= 1
+                if _network_block_count == 0:
+                    socket.socket.connect = _original_socket_connect  # type: ignore[method-assign]
+                    socket.socket.connect_ex = _original_socket_connect_ex  # type: ignore[method-assign]
+                    logger.debug("sandbox: network block released (count=0)")
+                else:
+                    logger.debug(
+                        "sandbox: network block still held (count=%d)",
+                        _network_block_count,
+                    )


 def detect_verification_commands(workspace_root: str | Path | None) -> list[str]:
--- a/src/agentkit/server/app.py
+++ b/src/agentkit/server/app.py
@ -805,7 +805,14 @@ async def lifespan(app: FastAPI):
    try:
        from agentkit.core.config_driven import drain_pending_evolution_tasks

-        await drain_pending_evolution_tasks()
+        await asyncio.wait_for(drain_pending_evolution_tasks(), timeout=10.0)
+    except asyncio.TimeoutError:
+        from agentkit.core.config_driven import _pending_evolution_tasks
+
+        logger.warning(
+            "drain_pending_evolution_tasks 超时 10s, %d 个任务被放弃",
+            len(_pending_evolution_tasks),
+        )
    except Exception:
        logger.debug("drain_pending_evolution_tasks 异常已忽略", exc_info=True)

--- a/src/agentkit/server/routes/chat.py
+++ b/src/agentkit/server/routes/chat.py
@ -1494,6 +1494,23 @@ async def _handle_chat_message(
                },
            }
        )
+        # U8/R8: persist the spec_review_request so it survives a page reload.
+        # The frontend reconstructs the pending review card from the restored
+        # message metadata (spec_review_id + goal + steps).
+        try:
+            await sm.append_message(
+                session_id=session_id,
+                role=MessageRole.ASSISTANT,
+                content=f"[Spec Review] {goal}",
+                metadata={
+                    "message_type": "spec_review_request",
+                    "spec_review_id": spec_review_id,
+                    "spec_review_goal": goal,
+                    "spec_review_steps": steps,
+                },
+            )
+        except Exception:
+            logger.debug("Failed to persist spec_review_request", exc_info=True)

        loop = asyncio.get_running_loop()
        future: asyncio.Future[tuple[str, str]] = loop.create_future()
@ -1506,19 +1523,58 @@ async def _handle_chat_message(
            # "failed") so the user can resume on return.
            decision, feedback = await asyncio.wait_for(future, timeout=1800.0)
            logger.info(f"Spec review {spec_review_id} resolved: decision={decision!r}")
+            # Persist the decision so the frontend can show the outcome after
+            # a reload (e.g. timeout→parked transition the user never saw).
+            try:
+                await sm.append_message(
+                    session_id=session_id,
+                    role=MessageRole.ASSISTANT,
+                    content=f"[Spec Review Decision] {decision}: {feedback}",
+                    metadata={
+                        "message_type": "spec_review_reply",
+                        "spec_review_id": spec_review_id,
+                        "spec_review_decision": decision,
+                        "spec_review_feedback": feedback,
+                    },
+                )
+            except Exception:
+                logger.debug("Failed to persist spec_review_reply", exc_info=True)
            return decision, feedback
        except asyncio.TimeoutError:
            logger.warning(f"Spec review {spec_review_id} timed out (30 min)")
+            # Persist the timeout→parked transition so the frontend can show
+            # the parked state after a reload.
+            try:
+                await sm.append_message(
+                    session_id=session_id,
+                    role=MessageRole.ASSISTANT,
+                    content=f"[Spec Review Timed Out] {spec_review_id}",
+                    metadata={
+                        "message_type": "spec_review_reply",
+                        "spec_review_id": spec_review_id,
+                        "spec_review_decision": "parked",
+                        "spec_review_feedback": "timed out (30 min)",
+                    },
+                )
+            except Exception:
+                logger.debug("Failed to persist spec_review timeout", exc_info=True)
            raise
        finally:
            _pending_spec_reviews.pop(spec_review_id, None)

-    # Wire the handler onto a PlanExecEngine only (the WS PLAN_EXEC path uses
-    # a ReActEngine + phase_policy, where this is a no-op). Local import to
-    # avoid a top-level dependency that the WS path doesn't need.
-    from agentkit.core.plan_exec_engine import PlanExecEngine as _PlanExecEngine
-
-    if isinstance(react_engine, _PlanExecEngine):
+    # U8/R8: spec review gate wiring. The WS PLAN_EXEC path uses
+    # ``_build_phase_engine`` which returns a ``ReActEngine`` with
+    # ``phase_policy`` (NOT a ``PlanExecEngine``), so the gate cannot be
+    # wired here — ``ReActEngine`` does not read ``_spec_review_handler``.
+    # The gate only fires when ``ConfigDrivenAgent.execute_stream`` →
+    # ``_handle_plan_exec_stream`` → ``PlanExecEngine.execute_stream`` runs,
+    # which is the portal/task path (not the WS chat path).
+    # ponytail: known ceiling — WS chat PLAN_EXEC (phase_policy mechanism)
+    # does not support spec review. Upgrade path: route WS PLAN_EXEC through
+    # ``ConfigDrivenAgent.execute_stream`` to unify with the portal path and
+    # inherit the gate. The ``_spec_review_handler`` closure + event handlers
+    # below are kept so the upgrade is a routing change, not a rewrite.
+    if hasattr(react_engine, "_spec_review_handler"):
        react_engine._spec_review_handler = _spec_review_handler

    logger.info(
--- a/src/agentkit/server/routes/portal.py
+++ b/src/agentkit/server/routes/portal.py
@ -23,7 +23,7 @@ from pydantic import BaseModel

 from agentkit.core.config_driven import ConfigDrivenAgent
 from agentkit.core.event_queue import EventQueue
-from agentkit.core.protocol import Event, TaskEventType, TaskStatus, TurnEventType
+from agentkit.core.protocol import Event, TaskEventType, TaskMessage, TaskStatus, TurnEventType
 from agentkit.core.react import ReActEngine
 from agentkit.chat.skill_routing import ExecutionMode, SkillRoutingResult
 from agentkit.chat.request_preprocessor import RequestPreprocessor
@ -73,6 +73,42 @@ def _ensure_non_empty(text: str | None) -> str:
    return EMPTY_LLM_RESPONSE


+def _build_portal_task(
+    *,
+    agent_name: str,
+    messages: list[dict[str, str]],
+    system_prompt: str | None,
+    timeout_seconds: float | None,
+    conversation_id: str | None = None,
+    task_id: str | None = None,
+) -> TaskMessage:
+    """Construct a TaskMessage for routing through ConfigDrivenAgent.execute_stream.
+
+    The portal builds messages externally (history + user message). The
+    ``messages`` key in input_data tells _build_llm_messages to use them
+    directly instead of rendering the prompt template. This lets the portal
+    inherit evolution hooks + trace_outcome propagation from execute_stream's
+    finally block (KTD-4/KTD-8).
+    """
+    from datetime import datetime, timezone
+
+    return TaskMessage(
+        task_id=task_id or str(uuid.uuid4()),
+        agent_name=agent_name,
+        task_type="chat",
+        priority=0,
+        input_data={
+            "messages": messages,
+            "system_prompt": system_prompt,
+            "content": messages[-1].get("content", "") if messages else "",
+        },
+        callback_url=None,
+        created_at=datetime.now(timezone.utc),
+        timeout_seconds=int(timeout_seconds) if timeout_seconds else 300,
+        conversation_id=conversation_id,
+    )
+
+
 async def _emit_event_safe(
    event_queue: EventQueue | None,
    event_type: str,
@ -556,38 +592,39 @@ async def chat(request: ChatRequest, req: Request, _auth: None = Depends(_verify
            )

        react_config = agent.get_react_config()
-        react_engine = getattr(agent, "_react_engine", None)
-        if react_engine is None:
-            react_engine = ReActEngine(
+        # KTD-4/KTD-8: route through ConfigDrivenAgent.execute_stream so the
+        # finally block fires evolution hooks + propagates trace_outcome. The
+        # portal builds messages externally; _build_portal_task packages them
+        # into a TaskMessage whose input_data["messages"] is used directly by
+        # _build_llm_messages (bypassing the prompt template).
+        _react_engine = getattr(agent, "_react_engine", None)
+        if _react_engine is None:
+            _react_engine = ReActEngine(
                llm_gateway=llm_gateway,
                max_steps=react_config["max_steps"],
            )
+            agent._react_engine = _react_engine
        else:
-            react_engine.reset()
+            _react_engine.reset()

        messages = [{"role": "user", "content": request.message}]
        # Inject conversation history
        history_msgs = await _build_history_messages(conv.id)
        for hm in reversed(history_msgs):
            messages.insert(0, hm)
-        tools = agent.get_tools()
-        model = agent.get_model()
        system_prompt = getattr(agent, "_system_prompt", None) or agent.get_system_prompt()
        timeout_seconds = react_config["timeout_seconds"]

-        collected_output: list[str] = []
-        try:
-            # U2 verify: calls react_engine.execute_stream directly, bypassing
-            # ConfigDrivenAgent.execute_stream — evolution hooks NOT propagated
-            # here. Routing through agent.execute_stream is tracked separately.
-            async for event in react_engine.execute_stream(
-                messages=messages,
-                tools=tools,
-                model=model,
+        portal_task = _build_portal_task(
            agent_name=agent.name,
+            messages=messages,
            system_prompt=system_prompt,
            timeout_seconds=timeout_seconds,
-            ):
+            conversation_id=conv.id,
+        )
+        collected_output: list[str] = []
+        try:
+            async for event in agent.execute_stream(portal_task):
                if event.event_type == "final_answer":
                    collected_output.append(event.data.get("output", ""))
        except asyncio.CancelledError:
@ -684,34 +721,32 @@ async def chat_stream(request: ChatRequest, req: Request, _auth: None = Depends(
                )

            react_config = agent.get_react_config()
-            react_engine = getattr(agent, "_react_engine", None)
-            if react_engine is None:
-                react_engine = ReActEngine(
+            # KTD-4/KTD-8: route through ConfigDrivenAgent.execute_stream
+            # (evolution hooks + trace_outcome propagation in finally block).
+            _react_engine = getattr(agent, "_react_engine", None)
+            if _react_engine is None:
+                _react_engine = ReActEngine(
                    llm_gateway=llm_gateway,
                    max_steps=react_config["max_steps"],
                )
+                agent._react_engine = _react_engine
            else:
-                react_engine.reset()
+                _react_engine.reset()

            messages = [{"role": "user", "content": request.message}]
-            tools = agent.get_tools()
-            model = agent.get_model()
            system_prompt = getattr(agent, "_system_prompt", None) or agent.get_system_prompt()
            timeout_seconds = react_config["timeout_seconds"]

-            collected_output: list[str] = []
-            try:
-                # U2 verify: calls react_engine.execute_stream directly, bypassing
-                # ConfigDrivenAgent.execute_stream — evolution hooks NOT propagated
-                # here. Routing through agent.execute_stream is tracked separately.
-                async for event in react_engine.execute_stream(
-                    messages=messages,
-                    tools=tools,
-                    model=model,
+            portal_task = _build_portal_task(
                agent_name=agent.name,
+                messages=messages,
                system_prompt=system_prompt,
                timeout_seconds=timeout_seconds,
-                ):
+                conversation_id=conv.id,
+            )
+            collected_output: list[str] = []
+            try:
+                async for event in agent.execute_stream(portal_task):
                    if event.event_type == "final_answer":
                        collected_output.append(event.data.get("output", ""))
                    yield {
@ -967,11 +1002,8 @@ def _derive_title_from_messages(messages: list) -> str:


 async def _execute_react_background(
-    react_engine: ReActEngine,
+    agent: ConfigDrivenAgent,
    messages: list[dict],
-    tools: list,
-    model: str,
-    agent_name: str,
    system_prompt: str | None,
    timeout_seconds: float | None,
    conv_id: str,
@ -987,6 +1019,10 @@ async def _execute_react_background(
    Results are always persisted to the conversation store, regardless of
    whether a WebSocket subscriber is active.
    Task status is tracked in TaskStore when provided.
+
+    KTD-4/KTD-8: routes through ``agent.execute_stream`` (not
+    ``react_engine.execute_stream`` directly) so the finally block fires
+    evolution hooks and propagates trace_outcome.
    """
    collected_output: list[str] = []
    try:
@ -1005,17 +1041,15 @@ async def _execute_react_background(
            ):
                logger.warning("Failed to update TaskStore RUNNING", exc_info=True)

-        # U2 verify: calls react_engine.execute_stream directly, bypassing
-        # ConfigDrivenAgent.execute_stream — evolution hooks NOT propagated
-        # here. Routing through agent.execute_stream is tracked separately.
-        async for event in react_engine.execute_stream(
+        portal_task = _build_portal_task(
+            agent_name=agent.name,
            messages=messages,
-            tools=tools,
-            model=model,
-            agent_name=agent_name,
            system_prompt=system_prompt,
            timeout_seconds=timeout_seconds,
-        ):
+            conversation_id=conv_id,
+            task_id=task_id,
+        )
+        async for event in agent.execute_stream(portal_task):
            if event.event_type == "final_answer":
                collected_output.append(event.data.get("output", ""))

@ -1219,6 +1253,14 @@ async def portal_websocket(websocket: WebSocket):
    task_id: str | None = None
    # Track the active background task so cancel can propagate to it.
    active_bg_task: asyncio.Task | None = None
+    # U8/R8: pending spec review futures. The portal WS path doesn't wire
+    # _spec_review_handler on the agent (the background task architecture
+    # makes EventQueue-based request/reply non-trivial), so this dict is
+    # typically empty. It exists so stale spec_review_reply messages from
+    # the frontend are handled gracefully instead of silently ignored.
+    # ponytail: upgrade path — wire _spec_review_handler via an EventQueue
+    # and a future, mirroring chat.py's _spec_review_handler closure.
+    pending_spec_reviews: dict[str, asyncio.Future[tuple[str, str]]] = {}

    try:
        while True:
@ -1256,6 +1298,32 @@ async def portal_websocket(websocket: WebSocket):
                await websocket.send_json({"type": "pong"})
                continue

+            if msg_type == "spec_review_reply":
+                # U8/R8: mirror chat.py:1126 — resolve a pending spec review
+                # future. Typically a no-op in the portal WS path (the
+                # _spec_review_handler isn't wired), but handles stale replies
+                # gracefully.
+                spec_review_id = msg.get("spec_review_id")
+                decision = msg.get("decision", "rejected")
+                feedback = msg.get("feedback", "")
+                logger.info(
+                    f"Received spec_review_reply: id={spec_review_id!r}, decision={decision!r}"
+                )
+                if spec_review_id and spec_review_id in pending_spec_reviews:
+                    fut = pending_spec_reviews[spec_review_id]
+                    if not fut.done():
+                        fut.set_result((decision, feedback))
+                    else:
+                        logger.warning(
+                            f"spec_review_reply {spec_review_id!r} already resolved"
+                        )
+                else:
+                    logger.warning(
+                        f"spec_review_reply {spec_review_id!r} not found in "
+                        f"pending_spec_reviews — ignoring"
+                    )
+                continue
+
            if msg_type == "resume":
                # Frontend reconnected and wants to resume a running task
                resume_task_id = msg.get("task_id", "")
@ -1800,15 +1868,17 @@ async def portal_websocket(websocket: WebSocket):

            # Execute via ReAct stream
            react_config = agent.get_react_config()
-            # Reuse agent's ReActEngine if available (aligned with chat.py pattern)
-            react_engine = getattr(agent, "_react_engine", None)
-            if react_engine is None:
-                react_engine = ReActEngine(
+            # KTD-4/KTD-8: route through ConfigDrivenAgent.execute_stream
+            # (evolution hooks + trace_outcome propagation in finally block).
+            _react_engine = getattr(agent, "_react_engine", None)
+            if _react_engine is None:
+                _react_engine = ReActEngine(
                    llm_gateway=llm_gateway,
                    max_steps=react_config["max_steps"],
                )
+                agent._react_engine = _react_engine
            else:
-                react_engine.reset()
+                _react_engine.reset()

            messages = [{"role": "user", "content": message_text}]
            # Inject conversation history for context continuity
@ -1829,11 +1899,8 @@ async def portal_websocket(websocket: WebSocket):
            # background task continues running and persists the result.
            bg_task = asyncio.create_task(
                _execute_react_background(
-                    react_engine=react_engine,
+                    agent=agent,
                    messages=messages,
-                    tools=tools,
-                    model=model,
-                    agent_name=agent.name,
                    system_prompt=system_prompt,
                    timeout_seconds=timeout_seconds,
                    conv_id=conv.id,
--- a/tests/unit/server/test_portal_ws_background.py
+++ b/tests/unit/server/test_portal_ws_background.py
@ -38,10 +38,12 @@ class FakeConversationStore:
 class FakeReactEngine:
    """Fake ReAct engine that yields events from a predefined list."""

+    name = "test-agent"
+
    def __init__(self, events: list[Event]) -> None:
        self._events = events

-    async def execute_stream(self, **kwargs):
+    async def execute_stream(self, task):
        for event in self._events:
            yield event

@ -49,11 +51,13 @@ class FakeReactEngine:
 class FailingReactEngine:
    """Fake ReAct engine that raises an exception after yielding some events."""

+    name = "test-agent"
+
    def __init__(self, events: list[Event], error: Exception) -> None:
        self._events = events
        self._error = error

-    async def execute_stream(self, **kwargs):
+    async def execute_stream(self, task):
        for event in self._events:
            yield event
        raise self._error
@ -76,11 +80,13 @@ def _make_event(
 class SlowFakeReactEngine:
    """Fake ReAct engine with a delay to allow status checks during execution."""

+    name = "test-agent"
+
    def __init__(self, events: list[Event], delay: float = 0.1) -> None:
        self._events = events
        self._delay = delay

-    async def execute_stream(self, **kwargs):
+    async def execute_stream(self, task):
        for event in self._events:
            await asyncio.sleep(self._delay)
            yield event
@ -93,11 +99,13 @@ class CancellableReactEngine:
    Event so the test can cancel the task and verify CancelledError cleanup.
    """

+    name = "test-agent"
+
    def __init__(self, first_event: Event) -> None:
        self._first_event = first_event
        self.started = asyncio.Event()

-    async def execute_stream(self, **kwargs):
+    async def execute_stream(self, task):
        yield self._first_event
        self.started.set()
        # Block forever until cancelled
@ -130,11 +138,8 @@ class TestExecuteReactBackground:
        eq = EventQueue()

        await _execute_react_background(
-            react_engine=engine,
+            agent=engine,
            messages=[],
-            tools=[],
-            model="test-model",
-            agent_name="test-agent",
            system_prompt=None,
            timeout_seconds=None,
            conv_id="test-conv",
@ -162,11 +167,8 @@ class TestExecuteReactBackground:
        eq = EventQueue()

        await _execute_react_background(
-            react_engine=engine,
+            agent=engine,
            messages=[],
-            tools=[],
-            model="test-model",
-            agent_name="test-agent",
            system_prompt=None,
            timeout_seconds=None,
            conv_id="test-conv",
@ -190,11 +192,8 @@ class TestExecuteReactBackground:
        eq = EventQueue()

        await _execute_react_background(
-            react_engine=engine,
+            agent=engine,
            messages=[],
-            tools=[],
-            model="test-model",
-            agent_name="test-agent",
            system_prompt=None,
            timeout_seconds=None,
            conv_id="test-conv",
@ -228,11 +227,8 @@ class TestExecuteReactBackground:
        await asyncio.sleep(0.05)

        await _execute_react_background(
-            react_engine=engine,
+            agent=engine,
            messages=[],
-            tools=[],
-            model="test-model",
-            agent_name="test-agent",
            system_prompt=None,
            timeout_seconds=None,
            conv_id="test-conv",
@ -270,11 +266,8 @@ class TestExecuteReactBackground:
        await asyncio.sleep(0.05)

        await _execute_react_background(
-            react_engine=engine,
+            agent=engine,
            messages=[],
-            tools=[],
-            model="test-model",
-            agent_name="test-agent",
            system_prompt=None,
            timeout_seconds=None,
            conv_id="test-conv",
@ -318,11 +311,8 @@ class TestTaskStoreIntegration:
        # Start background task
        bg_task = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
-                tools=[],
-                model="test-model",
-                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="test-conv",
@ -365,11 +355,8 @@ class TestTaskStoreIntegration:
        )

        await _execute_react_background(
-            react_engine=engine,
+            agent=engine,
            messages=[],
-            tools=[],
-            model="test-model",
-            agent_name="test-agent",
            system_prompt=None,
            timeout_seconds=None,
            conv_id="test-conv",
@ -394,11 +381,8 @@ class TestTaskStoreIntegration:

        # Should not raise
        await _execute_react_background(
-            react_engine=engine,
+            agent=engine,
            messages=[],
-            tools=[],
-            model="test-model",
-            agent_name="test-agent",
            system_prompt=None,
            timeout_seconds=None,
            conv_id="test-conv",
@ -552,11 +536,8 @@ class TestCancelledErrorPath:

        bg_task = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
-                tools=[],
-                model="test-model",
-                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="test-conv",
@ -590,11 +571,8 @@ class TestCancelledErrorPath:

        bg_task = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
-                tools=[],
-                model="test-model",
-                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="test-conv",
@ -636,11 +614,8 @@ class TestCancelledErrorPath:

        bg_task = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
-                tools=[],
-                model="test-model",
-                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="test-conv",
@ -769,11 +744,8 @@ class TestCancelPropagation:
        # Simulate the background task as portal.py would create it
        active_bg_task: asyncio.Task | None = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
-                tools=[],
-                model="test-model",
-                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="cancel-conv",
@ -814,11 +786,8 @@ class TestCancelPropagation:

        bg_task = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
-                tools=[],
-                model="test-model",
-                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="test-conv",
@ -865,11 +834,8 @@ class TestWebSocketDisconnectNoCancel:
        # Start the background task (as portal.py would)
        bg_task = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
-                tools=[],
-                model="test-model",
-                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="test-conv",
@ -912,11 +878,8 @@ class TestWebSocketDisconnectNoCancel:

        bg_task = asyncio.create_task(
            _execute_react_background(
-                react_engine=engine,
+                agent=engine,
                messages=[],
-                tools=[],
-                model="test-model",
-                agent_name="test-agent",
                system_prompt=None,
                timeout_seconds=None,
                conv_id="resume-conv",
--- a/tests/unit/test_execute_stream_hooks.py
+++ b/tests/unit/test_execute_stream_hooks.py
@ -112,7 +112,8 @@ class TestExecuteStreamHooks:
        assert events[0].event_type == "final_answer"
        assert len(fired) == 1
        assert fired[0].status == TaskStatus.COMPLETED
-        assert fired[0].output_data == {"content": "hello world"}
+        # KTD-8: output_data includes trace_outcome for lifecycle._is_failure_path()
+        assert fired[0].output_data == {"content": "hello world", "trace_outcome": "success"}

    async def test_failure_fires_on_task_failed(self):
        """Stream exception fires evolve_after_task with FAILED status."""