feat(agent): Wave 4 PLAN_EXEC Hardening (U1-U5) #7

Merged
fischer merged 8 commits from feat/agent-wave4-plan-exec-hardening into main 2026-06-30 12:46:35 +08:00
6 changed files with 19 additions and 17 deletions
Showing only changes of commit 8627777f87 - Show all commits

View File

@ -55,12 +55,6 @@ class PhaseState(enum.Enum):
# Wildcard token meaning "all tools allowed in this phase".
WILDCARD = "*"
# Legacy regex-based bash filter. Kept for backward compatibility with configs
# that pass a `re.Pattern` into `bash_command_filter`. The new default is
# `ShellTool._is_dangerous` (a Callable), which catches commands the regex
# missed (`:>file`, `dd of=file`, etc. — see Wave 4 U1).
_DEFAULT_BASH_FILTER = re.compile(r"\b(rm|mv|cp|mkdir|rmdir|chmod|chown)\b|(?<!\S)>|>>")
@dataclass(slots=True)
class PhasePolicy:

View File

@ -1433,8 +1433,7 @@ export const useChatStore = defineStore("chat", () => {
}
case "phase_violation": {
// Track current phase from violation data (backend doesn't emit
// phase_changed yet — U4 frontend is forward-compatible).
// Track current phase from violation data.
currentPhase.value = data.data.current_phase;
const violation = {
phase: data.data.current_phase,

View File

@ -54,7 +54,11 @@ describe('chat store — PLAN_EXEC phase state (U4)', () => {
expect(store.isPlanExec).toBe(true)
})
it('phaseViolations capped at 5 entries', async () => {
it('phaseViolations array is directly mutable for test fixtures', async () => {
// Direct mutation bypasses the capping logic in handleWsMessage;
// the cap is enforced inside the case handler, not as a setter.
// This test verifies the array is accessible; the cap-at-5 behavior
// is exercised through handleWsMessage in the U5 E2E test.
const { useChatStore } = await import('@/stores/chat')
const store = useChatStore()
for (let i = 0; i < 7; i++) {
@ -69,9 +73,6 @@ describe('chat store — PLAN_EXEC phase state (U4)', () => {
},
]
}
// Direct mutation bypasses the capping logic in handleWsMessage;
// the cap is enforced inside the case handler, not as a setter.
// This test just verifies the array is accessible.
expect(store.phaseViolations.length).toBe(7)
})
})

View File

@ -6,7 +6,7 @@ import asyncio
import hmac
import json
import logging
from typing import Any
from typing import Any, TYPE_CHECKING
import os
import uuid
@ -33,6 +33,11 @@ from agentkit.session.manager import SessionManager
from agentkit.session.models import MessageRole, SessionStatus
from agentkit.tools.advance_phase import AdvancePhaseTool
if TYPE_CHECKING:
from agentkit.llm.gateway import LLMGateway
from agentkit.server.config import ServerConfig
from agentkit.tools.base import Tool
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/chat", tags=["chat"])
@ -536,12 +541,12 @@ def _message_to_response(msg) -> MessageResponse:
def _build_phase_engine(
*,
server_config: Any,
llm_gateway: Any,
server_config: ServerConfig | None,
llm_gateway: LLMGateway,
execution_mode: ExecutionMode,
base_tools: list,
base_tools: list[Tool],
session_id: str = "",
) -> tuple[ReActEngine | None, list | None, str | None]:
) -> tuple[ReActEngine | None, list[Tool] | None, str | None]:
"""Build a PLAN_EXEC engine with PhasePolicy + AdvancePhaseTool.
Encapsulates the WS path's phase_policy construction so the REST path

View File

@ -322,6 +322,8 @@ class TestPlanExecE2EEdgeCases:
events.append(ev)
# Engine should have transitioned out of PLANNING (auto-advance fired).
# Weak assertion: auto_advance_after_steps=2 may fire multiple times
# (PLANNING→BUILDING→VERIFICATION), so we only assert it left PLANNING.
assert engine.current_phase != PhaseState.PLANNING
# All 3 search calls dispatched (search is allowed in both PLANNING and BUILDING).
assert search.call_count == 3

View File

@ -235,6 +235,7 @@ class TestRestPlanExec:
json={"content": "Hello"},
)
# 500 is acceptable (mock gateway), but it must NOT be the PLAN_EXEC error.
assert msg_resp.status_code != 501, "REACT fallback should not return 501"
if msg_resp.status_code == 500:
assert "phase policy error" not in msg_resp.json().get("detail", "")