From b8418968c224d420d1a09f6b967f83dc50c16043 Mon Sep 17 00:00:00 2001 From: chiguyong Date: Fri, 3 Jul 2026 12:32:22 +0800 Subject: [PATCH] feat(core): verification defaults for PLAN_EXEC/TEAM_COLLAB + minimum sandbox (U3, R2/R3/RV3) --- src/agentkit/core/phase.py | 43 +++- src/agentkit/core/plan_exec_engine.py | 31 +++ src/agentkit/core/react.py | 53 ++++- src/agentkit/core/sandbox.py | 166 ++++++++++++++ src/agentkit/experts/orchestrator.py | 114 +++++++++- tests/unit/test_sandbox.py | 173 ++++++++++++++ tests/unit/test_verification_defaults.py | 276 +++++++++++++++++++++++ 7 files changed, 834 insertions(+), 22 deletions(-) create mode 100644 src/agentkit/core/sandbox.py create mode 100644 tests/unit/test_sandbox.py create mode 100644 tests/unit/test_verification_defaults.py diff --git a/src/agentkit/core/phase.py b/src/agentkit/core/phase.py index d6922cc..8ece242 100644 --- a/src/agentkit/core/phase.py +++ b/src/agentkit/core/phase.py @@ -7,6 +7,11 @@ KTD3 (Wave 3 plan): state machine lives in ReActEngine, not skill config. KTD5: default whitelist matches brainstorm R24 (Planning: think/search; Building: write_file; etc.). KTD6: transitions are LLM-driven via AdvancePhaseTool; auto-advance is opt-in. + +U3 (R3): ``default_policy()`` accepts an optional ``workspace_root`` and +populates ``PhasePolicy.verification_commands`` via coding-task detection +(``pyproject.toml`` / ``.py`` presence) — coding tasks force pytest/ruff; +non-coding tasks leave the list empty for Spec-declared commands. """ from __future__ import annotations @@ -15,6 +20,7 @@ import enum import logging import re from dataclasses import dataclass, field, replace +from pathlib import Path from typing import Any, Callable from agentkit.tools.shell import ShellTool @@ -78,11 +84,17 @@ class PhasePolicy: """ whitelist: dict[PhaseState, frozenset[str]] - bash_command_filter: dict[ - PhaseState, Callable[[str], bool] | re.Pattern | None - ] = field(default_factory=dict) + bash_command_filter: dict[PhaseState, Callable[[str], bool] | re.Pattern | None] = field( + default_factory=dict + ) auto_advance_after_steps: int | None = None # None = manual (LLM calls advance_phase) start_phase: PhaseState = PhaseState.PLANNING + # U3/R3: verification commands to run at the VERIFICATION phase's final-answer + # point. Populated by default_policy() via coding-task detection. None = no + # opinion (ReActEngine falls back to its own verification_commands param or + # VerificationLoop defaults). An empty list means "no commands" (verification + # passes trivially — for non-coding tasks using Spec-declared commands instead). + verification_commands: list[str] | None = None def __post_init__(self) -> None: # Fail-fast: empty whitelist for a non-wildcard phase = bug. @@ -124,19 +136,16 @@ class PhasePolicy: return { "whitelist": {phase.value: sorted(tools) for phase, tools in self.whitelist.items()}, "bash_command_filter": { - phase.value: ( - "" - if callable(p) - else (p.pattern if p else None) - ) + phase.value: ("" if callable(p) else (p.pattern if p else None)) for phase, p in self.bash_command_filter.items() }, "auto_advance_after_steps": self.auto_advance_after_steps, "start_phase": self.start_phase.value, + "verification_commands": self.verification_commands, } -def default_policy() -> PhasePolicy: +def default_policy(workspace_root: str | Path | None = None) -> PhasePolicy: """Return the KTD5 default PhasePolicy. Whitelist (R24): @@ -151,7 +160,22 @@ def default_policy() -> PhasePolicy: operators, and the full danger taxonomy shared with the ShellTool confirmation path. - BUILDING/DELIVERY: no filter (full bash) + + U3/R3: ``verification_commands`` is populated via coding-task detection on + ``workspace_root``. Coding workspaces (``pyproject.toml`` or ``.py`` + present) force ``pytest -x -q`` and ``ruff check src/``. Non-coding + workspaces get ``None`` (no opinion — Spec-declared commands are used). """ + # U3/R3: coding-task detection. Local import avoids a circular dependency + # (sandbox.py is standalone, but keeping the import local makes the R3 + # concern visually scoped to default_policy). + from agentkit.core.sandbox import detect_verification_commands + + verification_cmds = detect_verification_commands(workspace_root) + # detect_verification_commands returns [] for non-coding workspaces. + # For non-coding workspaces, leave verification_commands as None so the + # caller knows "no coding-specific commands" and can substitute Spec-declared + # commands. For coding workspaces, set the forced pytest/ruff list. return PhasePolicy( whitelist={ # Tool name is "shell" (ShellTool default); bash_command_filter @@ -172,6 +196,7 @@ def default_policy() -> PhasePolicy: }, auto_advance_after_steps=None, # manual by default start_phase=PhaseState.PLANNING, + verification_commands=verification_cmds if verification_cmds else None, ) diff --git a/src/agentkit/core/plan_exec_engine.py b/src/agentkit/core/plan_exec_engine.py index d12d798..e9bae65 100644 --- a/src/agentkit/core/plan_exec_engine.py +++ b/src/agentkit/core/plan_exec_engine.py @@ -75,6 +75,13 @@ class PlanExecEngine: workspace: SharedWorkspace | None = None, step_event_callback: "Callable[[str, dict[str, Any]], Awaitable[None]] | None" = None, spec_manager: SpecManager | None = None, + # U3/R2: verification defaults True for PLAN_EXEC (per R2). Threaded + # through to each step's ReActEngine so the canonical verify-at-final- + # answer path (react.py:1303+) runs pytest/ruff on coding tasks. + # DIRECT_CHAT/REACT keep verification_enabled=False (the ReActEngine + # default) — only PLAN_EXEC/TEAM_COLLAB verify. + verification_enabled: bool = True, + verification_commands: list[str] | None = None, ): """ Args: @@ -84,6 +91,12 @@ class PlanExecEngine: workspace: SharedWorkspace 实例,用于步骤间状态传递 step_event_callback: 步骤事件回调,用于非流式执行时推送进度 spec_manager: SpecManager 实例,用于持久化执行计划为 Spec 文档 + verification_enabled: U3/R2 — verification-on-final-answer for + PLAN_EXEC steps. Defaults True (per R2). Each step's + ReActEngine is constructed with this flag. + verification_commands: Optional override for the verification + commands. None = let ReActEngine / VerificationLoop use its + own defaults (pytest -x -q, ruff check src/). """ self._llm_gateway = llm_gateway self._max_replans = max_replans @@ -92,6 +105,8 @@ class PlanExecEngine: self._step_event_callback = step_event_callback self._spec_manager = spec_manager self._confirmation_handler: Any | None = None + self._verification_enabled = verification_enabled + self._verification_commands = verification_commands # 组合子组件 self._planner = GoalPlanner(llm_gateway=llm_gateway) @@ -929,6 +944,8 @@ class PlanExecEngine: system_prompt=system_prompt, tools=tools, confirmation_handler=self._confirmation_handler, + verification_enabled=self._verification_enabled, + verification_commands=self._verification_commands, ) return PlanExecutor( agent_pool=step_executor, @@ -1089,6 +1106,8 @@ class ReActStepExecutor: tools: list["Tool"] | None = None, max_steps: int = 5, confirmation_handler: Any | None = None, + verification_enabled: bool = False, + verification_commands: list[str] | None = None, ): self._llm_gateway = llm_gateway self._messages = messages or [] @@ -1097,6 +1116,8 @@ class ReActStepExecutor: self._tools = tools or [] self._max_steps = max_steps self._confirmation_handler = confirmation_handler + self._verification_enabled = verification_enabled + self._verification_commands = verification_commands self._agents: dict[str, _ReActStepAgent] = {} async def create_agent_from_skill(self, skill_name: str): @@ -1110,6 +1131,8 @@ class ReActStepExecutor: tools=self._tools, max_steps=self._max_steps, confirmation_handler=self._confirmation_handler, + verification_enabled=self._verification_enabled, + verification_commands=self._verification_commands, ) self._agents[skill_name] = agent return agent @@ -1126,6 +1149,8 @@ class ReActStepExecutor: system_prompt=self._system_prompt, tools=self._tools, max_steps=self._max_steps, + verification_enabled=self._verification_enabled, + verification_commands=self._verification_commands, ) self._agents[key] = agent return agent @@ -1148,6 +1173,8 @@ class _ReActStepAgent: tools: list["Tool"] | None = None, max_steps: int = 5, confirmation_handler: Any | None = None, + verification_enabled: bool = False, + verification_commands: list[str] | None = None, ): self.name = name self._llm_gateway = llm_gateway @@ -1157,6 +1184,8 @@ class _ReActStepAgent: self._tools = tools or [] self._max_steps = max_steps self._confirmation_handler = confirmation_handler + self._verification_enabled = verification_enabled + self._verification_commands = verification_commands async def execute(self, task_msg: TaskMessage) -> "TaskResult": """执行步骤:通过 ReActEngine 循环调用""" @@ -1184,6 +1213,8 @@ class _ReActStepAgent: engine = ReActEngine( llm_gateway=self._llm_gateway, max_steps=self._max_steps, + verification_enabled=self._verification_enabled, + verification_commands=self._verification_commands, ) # 构建 messages diff --git a/src/agentkit/core/react.py b/src/agentkit/core/react.py index 31f0f87..7d5c87f 100644 --- a/src/agentkit/core/react.py +++ b/src/agentkit/core/react.py @@ -23,6 +23,7 @@ from agentkit.core.exceptions import ( ) from agentkit.core.protocol import CancellationToken from agentkit.core.compressor import estimate_text_tokens +from agentkit.core.sandbox import SandboxNetworkBlockedError from agentkit.llm.gateway import LLMGateway from agentkit.llm.protocol import LLMResponse from agentkit.tools.base import Tool, ToolValidationError @@ -36,6 +37,7 @@ if TYPE_CHECKING: from agentkit.core.compressor import CompressionStrategy from agentkit.core.middleware import MiddlewareChain from agentkit.core.phase import PhasePolicy, PhaseState + from agentkit.core.sandbox import WorkspaceSandbox from agentkit.core.trace import TraceRecorder from agentkit.memory.retriever import MemoryRetriever @@ -182,6 +184,11 @@ class ReActEngine: # U3/G6: PLAN_EXEC phase policy (opt-in). None = no enforcement # (backward compat — all existing callers unaffected). phase_policy: "PhasePolicy | None" = None, + # U3/RV3: minimum sandbox. When set and the engine is in VERIFICATION + # phase, tool execution is wrapped in sandbox.network_block() so tools + # cannot make outbound network calls during verification. None = no + # sandbox (backward compat for DIRECT_CHAT/REACT and existing tests). + sandbox: "WorkspaceSandbox | None" = None, ): if max_steps < 1: raise ValueError(f"max_steps must be >= 1, got {max_steps}") @@ -194,7 +201,16 @@ class ReActEngine: self._default_timeout = default_timeout self._parallel_tools = parallel_tools self._verification_enabled = verification_enabled - self._verification_commands = verification_commands + # U3/R3: if no explicit verification_commands were passed but the + # phase_policy carries coding-task-detected commands (from + # default_policy(workspace_root)), inherit them. Explicit param wins + # so callers can override per-engine. + if verification_commands is not None: + self._verification_commands = verification_commands + elif phase_policy is not None and phase_policy.verification_commands: + self._verification_commands = list(phase_policy.verification_commands) + else: + self._verification_commands = verification_commands # U2/G2: prompt cache 双块结构开关(True 时 Anthropic 用 cache_control blocks, # 其他 provider 走字符串拼接依赖自动前缀缓存) self._prompt_cache_enable = prompt_cache_enable @@ -240,6 +256,9 @@ class ReActEngine: # simply ignores the accumulator (the error dict returned to the LLM is # the only signal there). self._phase_violations: list[dict[str, object]] = [] + # U3/RV3: minimum sandbox. When set and current phase is VERIFICATION, + # _execute_tool wraps tool.safe_execute() in sandbox.network_block(). + self._sandbox = sandbox def reset(self) -> None: """Reset internal state for reuse across conversations. @@ -1805,9 +1824,39 @@ class ReActEngine: # Strip internal metadata keys before passing to tool clean_args = {k: v for k, v in arguments.items() if not k.startswith("_")} + # U3/RV3: sandbox network block during VERIFICATION phase. When a + # sandbox is configured and the engine is in VERIFICATION, wrap the + # tool call so outbound network access is rejected. The error is + # returned as a structured dict (the loop continues — the LLM sees + # the rejection and can adjust). Other phases and no-sandbox engines + # are unaffected (backward compat). + in_verification = ( + self._sandbox is not None + and self._current_phase is not None + and self._current_phase.value == "verification" + ) + try: - result = await tool.safe_execute(**clean_args) + if in_verification: + async with self._sandbox.network_block(): + result = await tool.safe_execute(**clean_args) + else: + result = await tool.safe_execute(**clean_args) return result + except SandboxNetworkBlockedError as e: + # Structured error so the LLM understands *why* the call was + # rejected and can react (e.g. switch to a local-only approach). + error_msg = ( + f"Tool '{tool_name}' blocked by sandbox: network access is " + f"not allowed during VERIFICATION phase" + ) + logger.info("sandbox: %s blocked (%s)", tool_name, e) + return { + "error": error_msg, + "error_code": "sandbox_network_blocked", + "current_phase": "verification", + "tool": tool_name, + } except ToolValidationError as e: # 保留类型化错误码,不被通用 except 平坦化为字符串 error_msg = f"Tool '{tool_name}' schema validation failed: {e}" diff --git a/src/agentkit/core/sandbox.py b/src/agentkit/core/sandbox.py new file mode 100644 index 0000000..759aa0a --- /dev/null +++ b/src/agentkit/core/sandbox.py @@ -0,0 +1,166 @@ +"""Minimum sandbox enforcement for VERIFICATION phase (U3, RV3). + +Two concerns: + +1. **Workspace-write path enforcement** — reuses the 3-layer path validation + pattern from ``str_replace_editor.py`` (U1): reject absolute paths, reject + ``..`` traversal, and verify ``Path.resolve()`` stays within the workspace + root (catches symlink escape). + +2. **Network blocking** — an async context manager that patches + ``socket.socket.connect`` / ``connect_ex`` to raise during VERIFICATION + tool calls. This catches ``httpx`` / ``requests`` / ``urllib`` at their + common chokepoint (the stdlib socket layer). + + ponytail: process-wide socket patch — not subprocess-safe. A ``bash`` tool + spawning ``curl`` bypasses this because the child gets its own socket + namespace from the OS. Upgrade path: OS-level network namespace isolation + (``unshare -n`` / netns) or a seccomp filter on ``socket(2)``. The context + manager is sufficient for in-process tool calls (the stated RV3 scope). + +Full tiering (read-only / workspace-write / danger) is deferred — this module +implements only the minimum: workspace-write + no-network. +""" + +from __future__ import annotations + +import contextlib +import errno +import logging +import socket +from pathlib import Path + +logger = logging.getLogger(__name__) + + +class SandboxNetworkBlockedError(RuntimeError): + """Raised when a tool attempts an outbound network call under sandbox.""" + + +class WorkspaceSandbox: + """Minimum sandbox: workspace-write path enforcement + network blocking. + + Construct once per engine (or per VERIFICATION phase) and reuse. The + ``validate_path`` method is sync (cheap, no I/O). The ``network_block`` + context manager is async because it is used around ``await tool.execute()``. + """ + + def __init__(self, workspace_root: str | Path) -> None: + # Resolve once so prefix checks compare against a stable, real + # directory (no symlink inside the workspace root itself). + self._workspace_root: Path = Path(workspace_root).resolve() + + @property + def workspace_root(self) -> Path: + return self._workspace_root + + # ── path validation (reuses U1 str_replace_editor 3-layer pattern) ── + + def validate_path(self, raw_path: str) -> Path: + """Resolve ``raw_path`` against the workspace root and verify confinement. + + Returns the resolved absolute ``Path`` on success. Raises ``ValueError`` + if the path is absolute, contains a ``..`` component, or resolves + outside the workspace root (path traversal or symlink escape). + + Mirrors ``StrReplaceEditorTool._resolve_within_workspace`` but raises + instead of returning ``None`` — this is the security boundary, so a + loud exception is the right signal for misuse from internal callers. + """ + if not isinstance(raw_path, str) or not raw_path: + raise ValueError("sandbox: path must be a non-empty string") + p = Path(raw_path) + if p.is_absolute(): + raise ValueError( + f"sandbox: absolute paths are rejected ({raw_path!r}); " + f"use a path relative to the workspace root" + ) + if ".." in p.parts: + raise ValueError(f"sandbox: path traversal ('..') is rejected ({raw_path!r})") + resolved = (self._workspace_root / raw_path).resolve() + try: + resolved.relative_to(self._workspace_root) + except ValueError as e: + raise ValueError( + f"sandbox: path {raw_path!r} resolves outside the workspace " + f"root ({self._workspace_root})" + ) from e + return resolved + + # ── coding-workspace detection ───────────────────────────────────── + + def is_coding_workspace(self) -> bool: + """Return True if the workspace looks like a Python coding project. + + Heuristic: ``pyproject.toml`` OR any ``.py`` file in the workspace root + (non-recursive scan of the top level — cheap, O(dirent count)). + ponytail: top-level scan only — a ``.py`` file nested 3 levels deep + is missed. Upgrade path: recursive walk with a depth cap, or trust + ``pyproject.toml`` as the single signal (which it nearly always is). + """ + if (self._workspace_root / "pyproject.toml").exists(): + return True + try: + for entry in self._workspace_root.iterdir(): + if entry.is_file() and entry.suffix == ".py": + return True + except (PermissionError, OSError) as e: + logger.warning("sandbox: failed to scan workspace root: %s", e) + return False + + # ── network blocking ─────────────────────────────────────────────── + + @contextlib.asynccontextmanager + async def network_block(self): + """Block outbound network connections within the async context. + + Patches ``socket.socket.connect`` and ``connect_ex`` to raise / + return ``ECONNREFUSED`` respectively. Restores the originals on exit, + even if the wrapped code raises. + + Already-connected sockets (e.g. an LLM gateway keep-alive pool) are + unaffected — only *new* ``connect()`` calls are blocked. This is the + correct granularity: the LLM gateway talks over its existing + connection, while a tool trying to ``requests.get(...)`` makes a new + connect and is rejected. + """ + original_connect = socket.socket.connect + original_connect_ex = socket.socket.connect_ex + + def _blocked_connect(self_sock, *args, **kwargs): # noqa: ANN001 + raise SandboxNetworkBlockedError( + "Network access blocked by sandbox during VERIFICATION phase" + ) + + def _blocked_connect_ex(self_sock, *args, **kwargs): # noqa: ANN001 + # connect_ex returns an errno instead of raising (POSIX contract). + return errno.ECONNREFUSED + + socket.socket.connect = _blocked_connect # type: ignore[method-assign] + socket.socket.connect_ex = _blocked_connect_ex # type: ignore[method-assign] + logger.debug("sandbox: network block engaged") + try: + yield + finally: + socket.socket.connect = original_connect # type: ignore[method-assign] + socket.socket.connect_ex = original_connect_ex # type: ignore[method-assign] + logger.debug("sandbox: network block released") + + +def detect_verification_commands(workspace_root: str | Path | None) -> list[str]: + """Return the verification commands appropriate for the workspace. + + Coding workspaces (``pyproject.toml`` or ``.py`` present) force + ``pytest -x -q`` and ``ruff check src/`` (R3). Non-coding workspaces return + an empty list — the caller (VerificationLoop) then falls back to its own + default, or the Spec-declared verification commands are used. + + A ``None`` workspace returns an empty list (conservative: don't assume a + coding project without evidence). + """ + if workspace_root is None: + return [] + sandbox = WorkspaceSandbox(workspace_root) + if sandbox.is_coding_workspace(): + return ["pytest -x -q", "ruff check src/"] + return [] diff --git a/src/agentkit/experts/orchestrator.py b/src/agentkit/experts/orchestrator.py index f5b973a..f325bc1 100644 --- a/src/agentkit/experts/orchestrator.py +++ b/src/agentkit/experts/orchestrator.py @@ -74,6 +74,11 @@ class TeamOrchestrator( checkpoint: object | None = None, workspace_root: str | None = None, rollback_timeout: float | None = None, + # U3/R2: verification defaults True for TEAM_COLLAB (per R2). Applied + # to each phase's isolated agent engine so the canonical verify-at- + # final-answer path (react.py:1303+) runs on coding tasks. + verification_enabled: bool = True, + verification_commands: list[str] | None = None, ) -> None: self._team = team # Track temporary agent names created for context isolation (KTD3) @@ -93,6 +98,47 @@ class TeamOrchestrator( # Both default to no-op-friendly values so existing call sites behave identically. self._workspace_root = workspace_root self._rollback_timeout = rollback_timeout or self.DEFAULT_ROLLBACK_TIMEOUT + # U3/R2: verification defaults for TEAM_COLLAB. + self._verification_enabled = verification_enabled + # U3/R3: if no explicit commands, detect from workspace (coding-task + # detection forces pytest/ruff). None workspace → None commands → + # ReActEngine/VerificationLoop uses its own defaults. + if verification_commands is not None: + self._verification_commands = verification_commands + else: + from agentkit.core.sandbox import detect_verification_commands + + self._verification_commands = detect_verification_commands(workspace_root) or None + + async def _get_isolated_agent(self, expert: Expert, phase: PlanPhase): + """Override to apply verification defaults to freshly created agents. + + Calls the mixin's ``_get_isolated_agent`` (which creates an isolated + ConfigDrivenAgent via the pool), then — for freshly created temp agents + only (not the shared fallback ``expert.agent``) — flips the engine's + ``_verification_enabled`` flag and sets ``_verification_commands`` so + the canonical verify-at-final-answer path runs for TEAM_COLLAB. + + We mutate the engine's private attributes directly because the pool + constructs the ReActEngine without a verification_enabled parameter + (the pool is shared across modes). The temp agent is cleaned up after + the phase, so this mutation is scoped and does not leak into other + team executions or the shared expert agent. + """ + agent = await super()._get_isolated_agent(expert, phase) + # Only configure freshly-created temp agents (not the shared fallback). + # _temp_agents[phase.id] is set by the mixin only on successful + # pool.create_agent — its presence means this is a fresh agent. + if ( + self._verification_enabled + and phase.id in self._temp_agents + and getattr(agent, "_react_engine", None) is not None + ): + engine = agent._react_engine # type: ignore[attr-defined] + engine._verification_enabled = True # type: ignore[attr-defined] + if self._verification_commands is not None: + engine._verification_commands = self._verification_commands # type: ignore[attr-defined] # noqa: E501 + return agent async def execute(self, task: str) -> dict[str, object]: """Execute a task in pipeline mode. Lead decomposes → topological sort → @@ -169,7 +215,14 @@ class TeamOrchestrator( if self._checkpoint is not None: try: await self._checkpoint.save_plan(plan) - except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError, ValueError, KeyError) as e: + except ( + ConnectionError, + OSError, + asyncio.TimeoutError, + RuntimeError, + ValueError, + KeyError, + ) as e: logger.warning(f"Checkpoint save_plan failed: {e}") # 4. Set EXECUTING status, execute phases @@ -266,7 +319,14 @@ class TeamOrchestrator( if should_save_checkpoint and self._checkpoint is not None: try: await self._checkpoint.save(plan.id, ph, plan.status.value) - except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError, ValueError, KeyError) as e: + except ( + ConnectionError, + OSError, + asyncio.TimeoutError, + RuntimeError, + ValueError, + KeyError, + ) as e: logger.warning(f"Checkpoint save failed for phase {ph.id}: {e}") # U3: Divergence detection — check completed phases for conflicts @@ -289,6 +349,7 @@ class TeamOrchestrator( # U3: 流式综合 — 每个 chunk 广播 team_synthesis_chunk # P2 fix: 携带 synthesis_id 让前端去重 streaming milestone(避免附身到上一次孤儿) synthesis_id = f"{plan.id}:synthesis" + async def _broadcast_synthesis_chunk(data: dict[str, object]) -> None: # data 可能是 {"chunk": "..."} 或 {"value": "..."}(synthesizer 决定) # 统一注入 synthesis_id,不破坏原 data 结构 @@ -306,18 +367,27 @@ class TeamOrchestrator( except asyncio.CancelledError: await self._broadcast_event( "team_synthesis", - {"content": "", "phases_completed": len(completed), - "phases_total": len(plan.phases), "status": "cancelled", - "synthesis_id": synthesis_id}, + { + "content": "", + "phases_completed": len(completed), + "phases_total": len(plan.phases), + "status": "cancelled", + "synthesis_id": synthesis_id, + }, ) raise except Exception as synth_err: logger.error(f"Synthesis streaming failed: {synth_err}") await self._broadcast_event( "team_synthesis", - {"content": "", "phases_completed": len(completed), - "phases_total": len(plan.phases), "status": "error", - "error": str(synth_err), "synthesis_id": synthesis_id}, + { + "content": "", + "phases_completed": len(completed), + "phases_total": len(plan.phases), + "status": "error", + "error": str(synth_err), + "synthesis_id": synthesis_id, + }, ) raise # 让外层 except 决定是否 fallback @@ -345,7 +415,14 @@ class TeamOrchestrator( if self._checkpoint is not None: try: await self._checkpoint.clear(plan.id) - except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError, ValueError, KeyError) as e: + except ( + ConnectionError, + OSError, + asyncio.TimeoutError, + RuntimeError, + ValueError, + KeyError, + ) as e: logger.warning(f"Checkpoint clear failed: {e}") return { @@ -363,7 +440,15 @@ class TeamOrchestrator( return await self._fallback_to_single_agent(task, plan, phase_results) except asyncio.CancelledError: raise - except (RuntimeError, ValueError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError, LLMProviderError) as e: + except ( + RuntimeError, + ValueError, + KeyError, + AttributeError, + ConnectionError, + asyncio.TimeoutError, + LLMProviderError, + ) as e: logger.error(f"Pipeline execution failed: {e}") plan.status = PlanStatus.FAILED await self._broadcast_event("team_dissolved", {"team_id": self._team.team_id}) @@ -500,7 +585,14 @@ class TeamOrchestrator( if phases: return phases logger.warning("LLM decomposition returned no valid phases") - except (LLMProviderError, asyncio.TimeoutError, ConnectionError, json.JSONDecodeError, ValueError, TypeError) as e: + except ( + LLMProviderError, + asyncio.TimeoutError, + ConnectionError, + json.JSONDecodeError, + ValueError, + TypeError, + ) as e: logger.warning(f"LLM task decomposition failed: {e}") return [PlanPhase(name="执行", assigned_expert=lead.config.name, task_description=task)] diff --git a/tests/unit/test_sandbox.py b/tests/unit/test_sandbox.py new file mode 100644 index 0000000..cdf100f --- /dev/null +++ b/tests/unit/test_sandbox.py @@ -0,0 +1,173 @@ +"""Unit tests for the minimum sandbox (U3, RV3). + +Covers: +- WorkspaceSandbox.validate_path — happy path + 3-layer security (absolute, + ``..`` traversal, symlink escape) +- WorkspaceSandbox.is_coding_workspace — pyproject.toml / .py detection +- WorkspaceSandbox.network_block — socket connect blocked inside context, + restored after exit, no effect outside +- detect_verification_commands — coding / non-coding / None workspace +""" + +from __future__ import annotations + +import socket +from pathlib import Path + +import pytest + +from agentkit.core.sandbox import ( + SandboxNetworkBlockedError, + WorkspaceSandbox, + detect_verification_commands, +) + + +# ── fixtures ────────────────────────────────────────────────────────── + + +@pytest.fixture +def workspace(tmp_path: Path) -> Path: + return tmp_path + + +@pytest.fixture +def sandbox(workspace: Path) -> WorkspaceSandbox: + return WorkspaceSandbox(workspace_root=workspace) + + +# ── validate_path ───────────────────────────────────────────────────── + + +def test_validate_path_resolves_relative(sandbox: WorkspaceSandbox, workspace: Path) -> None: + resolved = sandbox.validate_path("src/main.py") + assert resolved == (workspace / "src" / "main.py").resolve() + + +def test_validate_path_rejects_absolute(sandbox: WorkspaceSandbox) -> None: + with pytest.raises(ValueError, match="absolute paths are rejected"): + sandbox.validate_path("/etc/passwd") + + +def test_validate_path_rejects_traversal(sandbox: WorkspaceSandbox) -> None: + with pytest.raises(ValueError, match="path traversal"): + sandbox.validate_path("../../etc/passwd") + + +def test_validate_path_rejects_empty(sandbox: WorkspaceSandbox) -> None: + with pytest.raises(ValueError, match="non-empty string"): + sandbox.validate_path("") + + +def test_validate_path_rejects_symlink_escape( + sandbox: WorkspaceSandbox, workspace: Path, tmp_path_factory: pytest.TempPathFactory +) -> None: + outside = tmp_path_factory.mktemp("outside") + link = workspace / "escape" + link.symlink_to(outside) + with pytest.raises(ValueError, match="resolves outside the workspace"): + sandbox.validate_path("escape/secret.txt") + + +def test_validate_path_allows_nested(sandbox: WorkspaceSandbox, workspace: Path) -> None: + resolved = sandbox.validate_path("a/b/c/d.txt") + assert resolved == (workspace / "a" / "b" / "c" / "d.txt").resolve() + + +# ── is_coding_workspace ─────────────────────────────────────────────── + + +def test_is_coding_workspace_pyproject(sandbox: WorkspaceSandbox, workspace: Path) -> None: + (workspace / "pyproject.toml").write_text("[project]\nname='x'\n") + assert sandbox.is_coding_workspace() is True + + +def test_is_coding_workspace_py_file(sandbox: WorkspaceSandbox, workspace: Path) -> None: + (workspace / "main.py").write_text("print('hi')") + assert sandbox.is_coding_workspace() is True + + +def test_is_coding_workspace_empty(sandbox: WorkspaceSandbox) -> None: + assert sandbox.is_coding_workspace() is False + + +def test_is_coding_workspace_non_python(sandbox: WorkspaceSandbox, workspace: Path) -> None: + (workspace / "README.md").write_text("# not python") + (workspace / "index.js").write_text("console.log('hi')") + assert sandbox.is_coding_workspace() is False + + +# ── network_block ───────────────────────────────────────────────────── + + +async def test_network_block_blocks_connect(sandbox: WorkspaceSandbox) -> None: + async with sandbox.network_block(): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + with pytest.raises(SandboxNetworkBlockedError, match="blocked by sandbox"): + sock.connect(("127.0.0.1", 1)) + finally: + sock.close() + + +async def test_network_block_restores_after_exit(sandbox: WorkspaceSandbox) -> None: + original = socket.socket.connect + async with sandbox.network_block(): + assert socket.socket.connect is not original + assert socket.socket.connect is original + + +async def test_network_block_restores_on_exception(sandbox: WorkspaceSandbox) -> None: + original = socket.socket.connect + with pytest.raises(RuntimeError, match="boom"): + async with sandbox.network_block(): + raise RuntimeError("boom") + assert socket.socket.connect is original + + +async def test_network_block_connect_ex_returns_errno(sandbox: WorkspaceSandbox) -> None: + import errno as errno_mod + + async with sandbox.network_block(): + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + rc = sock.connect_ex(("127.0.0.1", 1)) + assert rc == errno_mod.ECONNREFUSED + finally: + sock.close() + + +async def test_no_network_block_outside_context(sandbox: WorkspaceSandbox) -> None: + """Sockets connect normally when the block is not active.""" + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + # connect_ex to a closed port returns ECONNREFUSED, not the sandbox error. + rc = sock.connect_ex(("127.0.0.1", 1)) + assert rc != 0 # some connection error (expected — nothing listening) + # The key assertion: no SandboxNetworkBlockedError was raised, meaning + # the block is not active outside its context. + finally: + sock.close() + + +# ── detect_verification_commands ────────────────────────────────────── + + +def test_detect_verification_commands_coding(workspace: Path) -> None: + (workspace / "pyproject.toml").write_text("[project]\nname='x'\n") + cmds = detect_verification_commands(workspace) + assert cmds == ["pytest -x -q", "ruff check src/"] + + +def test_detect_verification_commands_non_coding(workspace: Path) -> None: + (workspace / "README.md").write_text("# docs only") + cmds = detect_verification_commands(workspace) + assert cmds == [] + + +def test_detect_verification_commands_none() -> None: + assert detect_verification_commands(None) == [] + + +def test_detect_verification_commands_empty_workspace(workspace: Path) -> None: + assert detect_verification_commands(workspace) == [] diff --git a/tests/unit/test_verification_defaults.py b/tests/unit/test_verification_defaults.py new file mode 100644 index 0000000..1dac974 --- /dev/null +++ b/tests/unit/test_verification_defaults.py @@ -0,0 +1,276 @@ +"""Unit tests for verification defaults (U3, R2/R3) + sandbox integration. + +Covers: +- default_policy(workspace_root) — coding-task detection sets verification_commands +- PhasePolicy.verification_commands field — default None, to_dict() round-trip +- PlanExecEngine — verification_enabled defaults True (R2), thread-through +- TeamOrchestrator — verification_enabled defaults True (R2) +- ReActEngine — verification_commands inherited from phase_policy; default + verification_enabled stays False (RV2 — DIRECT_CHAT/REACT do not verify) +- ReActEngine._execute_tool — sandbox blocks network during VERIFICATION, + no block in other phases or when sandbox is None +""" + +from __future__ import annotations + +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock + +from agentkit.core.phase import PhasePolicy, PhaseState, WILDCARD, default_policy +from agentkit.core.plan_exec_engine import PlanExecEngine, ReActStepExecutor +from agentkit.core.react import ReActEngine +from agentkit.core.sandbox import WorkspaceSandbox +from agentkit.tools.base import Tool + + +# ── helpers ─────────────────────────────────────────────────────────── + + +def make_mock_gateway() -> MagicMock: + """A minimal mock LLMGateway for ReActEngine construction.""" + from agentkit.llm.gateway import LLMGateway + + gateway = MagicMock(spec=LLMGateway) + gateway.chat = AsyncMock(return_value=MagicMock()) + return gateway + + +class _NetworkTool(Tool): + """A test tool that attempts a socket connect — used to verify the sandbox + network block is active during VERIFICATION. + + Catches ``OSError`` (e.g. ``ConnectionRefusedError``) so that when the + sandbox is NOT active, the tool returns a normal result dict. When the + sandbox IS active, ``SandboxNetworkBlockedError`` (a ``RuntimeError``, + not an ``OSError``) propagates past this catch to ``_execute_tool``'s + dedicated handler. + """ + + def __init__(self) -> None: + super().__init__( + name="net_tool", + description="test tool that connects a socket", + input_schema={"type": "object", "properties": {}, "additionalProperties": False}, + ) + + async def execute(self, **kwargs) -> dict[str, object]: + import socket + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + try: + sock.connect(("127.0.0.1", 1)) + except OSError as e: + # Normal connection refusal (no listener) — proves the sandbox + # did NOT intercept the connect. + return {"ok": False, "error": type(e).__name__} + finally: + sock.close() + return {"ok": True} + + +# ── default_policy + PhasePolicy.verification_commands ──────────────── + + +def test_default_policy_no_workspace_has_none_commands() -> None: + policy = default_policy() + assert policy.verification_commands is None + + +def test_default_policy_coding_workspace_forces_pytest_ruff(tmp_path: Path) -> None: + (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n") + policy = default_policy(workspace_root=tmp_path) + assert policy.verification_commands == ["pytest -x -q", "ruff check src/"] + + +def test_default_policy_non_coding_workspace_has_none_commands(tmp_path: Path) -> None: + (tmp_path / "README.md").write_text("# docs only") + policy = default_policy(workspace_root=tmp_path) + assert policy.verification_commands is None + + +def test_default_policy_empty_workspace_has_none_commands(tmp_path: Path) -> None: + policy = default_policy(workspace_root=tmp_path) + assert policy.verification_commands is None + + +def test_phase_policy_verification_commands_defaults_none() -> None: + policy = PhasePolicy( + whitelist={PhaseState.PLANNING: frozenset({WILDCARD})}, + ) + assert policy.verification_commands is None + + +def test_phase_policy_to_dict_includes_verification_commands() -> None: + policy = PhasePolicy( + whitelist={PhaseState.PLANNING: frozenset({WILDCARD})}, + verification_commands=["pytest -x -q"], + ) + d = policy.to_dict() + assert d["verification_commands"] == ["pytest -x -q"] + + +# ── PlanExecEngine defaults (R2) ────────────────────────────────────── + + +def test_plan_exec_engine_verification_enabled_defaults_true() -> None: + engine = PlanExecEngine(llm_gateway=None) + assert engine._verification_enabled is True + + +def test_plan_exec_engine_verification_enabled_can_be_disabled() -> None: + engine = PlanExecEngine(llm_gateway=None, verification_enabled=False) + assert engine._verification_enabled is False + + +def test_plan_exec_engine_verification_commands_threaded() -> None: + cmds = ["pytest -x -q", "ruff check src/"] + engine = PlanExecEngine(llm_gateway=None, verification_commands=cmds) + assert engine._verification_commands == cmds + + +def test_react_step_executor_threads_verification_params() -> None: + executor = ReActStepExecutor( + verification_enabled=True, + verification_commands=["pytest"], + ) + assert executor._verification_enabled is True + assert executor._verification_commands == ["pytest"] + + +# ── TeamOrchestrator defaults (R2) ──────────────────────────────────── + + +def test_team_orchestrator_verification_enabled_defaults_true() -> None: + from agentkit.experts.orchestrator import TeamOrchestrator + from agentkit.experts.team import ExpertTeam + + team = MagicMock(spec=ExpertTeam) + orch = TeamOrchestrator(team=team) + assert orch._verification_enabled is True + + +def test_team_orchestrator_verification_can_be_disabled() -> None: + from agentkit.experts.orchestrator import TeamOrchestrator + from agentkit.experts.team import ExpertTeam + + team = MagicMock(spec=ExpertTeam) + orch = TeamOrchestrator(team=team, verification_enabled=False) + assert orch._verification_enabled is False + + +def test_team_orchestrator_detects_commands_from_workspace(tmp_path: Path) -> None: + from agentkit.experts.orchestrator import TeamOrchestrator + from agentkit.experts.team import ExpertTeam + + (tmp_path / "pyproject.toml").write_text("[project]\nname='x'\n") + team = MagicMock(spec=ExpertTeam) + orch = TeamOrchestrator(team=team, workspace_root=str(tmp_path)) + assert orch._verification_commands == ["pytest -x -q", "ruff check src/"] + + +# ── ReActEngine: verification_commands inheritance + default (RV2) ──── + + +def test_react_engine_default_verification_enabled_stays_false() -> None: + """RV2: DIRECT_CHAT/REACT do not verify by default.""" + engine = ReActEngine(llm_gateway=make_mock_gateway()) + assert engine._verification_enabled is False + + +def test_react_engine_inherits_verification_commands_from_phase_policy() -> None: + policy = PhasePolicy( + whitelist={PhaseState.PLANNING: frozenset({WILDCARD})}, + verification_commands=["pytest -x -q", "ruff check src/"], + ) + engine = ReActEngine( + llm_gateway=make_mock_gateway(), + phase_policy=policy, + ) + assert engine._verification_commands == ["pytest -x -q", "ruff check src/"] + + +def test_react_engine_explicit_commands_override_phase_policy() -> None: + policy = PhasePolicy( + whitelist={PhaseState.PLANNING: frozenset({WILDCARD})}, + verification_commands=["pytest -x -q", "ruff check src/"], + ) + engine = ReActEngine( + llm_gateway=make_mock_gateway(), + phase_policy=policy, + verification_commands=["echo custom"], + ) + assert engine._verification_commands == ["echo custom"] + + +def test_react_engine_no_policy_no_commands() -> None: + engine = ReActEngine(llm_gateway=make_mock_gateway()) + assert engine._verification_commands is None + + +# ── ReActEngine._execute_tool sandbox integration (RV3) ─────────────── + + +async def test_execute_tool_blocks_network_in_verification_phase() -> None: + """Sandbox blocks a tool's network call during VERIFICATION phase and + returns a structured error instead of raising.""" + policy = PhasePolicy( + whitelist={ + PhaseState.VERIFICATION: frozenset({"net_tool"}), + PhaseState.PLANNING: frozenset({WILDCARD}), + }, + start_phase=PhaseState.VERIFICATION, + ) + sandbox = WorkspaceSandbox(workspace_root=Path("/tmp")) + engine = ReActEngine( + llm_gateway=make_mock_gateway(), + phase_policy=policy, + sandbox=sandbox, + ) + tool = _NetworkTool() + result = await engine._execute_tool("net_tool", {}, [tool]) + assert result["error_code"] == "sandbox_network_blocked" + assert result["current_phase"] == "verification" + assert result["tool"] == "net_tool" + + +async def test_execute_tool_no_block_outside_verification() -> None: + """Sandbox does not block tool calls in non-VERIFICATION phases.""" + policy = PhasePolicy( + whitelist={ + PhaseState.PLANNING: frozenset({"net_tool"}), + PhaseState.VERIFICATION: frozenset({WILDCARD}), + }, + start_phase=PhaseState.PLANNING, + ) + sandbox = WorkspaceSandbox(workspace_root=Path("/tmp")) + engine = ReActEngine( + llm_gateway=make_mock_gateway(), + phase_policy=policy, + sandbox=sandbox, + ) + tool = _NetworkTool() + # In PLANNING phase, the tool should attempt the connect and fail with + # a connection error (not sandbox block). The connect to port 1 on + # localhost will fail with ECONNREFUSED — we just assert it's NOT the + # sandbox error code. + result = await engine._execute_tool("net_tool", {}, [tool]) + assert result.get("error_code") != "sandbox_network_blocked" + + +async def test_execute_tool_no_sandbox_no_block() -> None: + """No sandbox configured → no network blocking even in VERIFICATION.""" + policy = PhasePolicy( + whitelist={ + PhaseState.VERIFICATION: frozenset({"net_tool"}), + PhaseState.PLANNING: frozenset({WILDCARD}), + }, + start_phase=PhaseState.VERIFICATION, + ) + engine = ReActEngine( + llm_gateway=make_mock_gateway(), + phase_policy=policy, + sandbox=None, + ) + tool = _NetworkTool() + result = await engine._execute_tool("net_tool", {}, [tool]) + assert result.get("error_code") != "sandbox_network_blocked"