518 lines
21 KiB
Python
518 lines
21 KiB
Python
"""Tests for U8: spec review gate (R8).
|
|
|
|
Covers:
|
|
- Happy path (AE4): PLAN_EXEC pauses for review, user approves, execution resumes
|
|
- Rejection -> replan -> re-review; replan cap (2) -> failure (not infinite loop)
|
|
- Timeout -> Spec parked (not failed); ReActResult status="parked"
|
|
- Stream cancelled mid-review -> CancelledError propagates, no deadlock
|
|
- spec_review_handler None -> backward compat (no gate)
|
|
- spec_manager None + handler set -> skip gate + warn
|
|
- Handler raises -> exception propagated
|
|
- SpecManager.park()/resume() round-trip; parked survives reload; confirm() works
|
|
- Whitelist assertion (silent no-op prevention)
|
|
- Unknown spec_review_id ignored (no crash)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
from pathlib import Path
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from agentkit.core.exceptions import TaskCancelledError
|
|
from agentkit.core.plan_exec_engine import PlanExecEngine, _MAX_SPEC_REVIEW_REPLANS
|
|
from agentkit.core.plan_executor import PlanExecutionResult, StepExecutionResult
|
|
from agentkit.core.plan_schema import ExecutionPlan, PlanStep, PlanStepStatus
|
|
from agentkit.core.protocol import CancellationToken, TaskStatus
|
|
from agentkit.core.react import ReActResult
|
|
from agentkit.core.spec_manager import Spec, SpecManager, SpecStep
|
|
|
|
|
|
# ── Helpers ──────────────────────────────────────────────
|
|
|
|
|
|
def make_plan(
|
|
goal: str = "test goal",
|
|
plan_id: str = "plan-1",
|
|
steps: list[PlanStep] | None = None,
|
|
) -> ExecutionPlan:
|
|
"""Construct an ExecutionPlan with a distinct plan_id."""
|
|
if steps is None:
|
|
steps = [
|
|
PlanStep(step_id="step-0", name="Step 0", description="First step"),
|
|
PlanStep(step_id="step-1", name="Step 1", description="Second step"),
|
|
]
|
|
plan = ExecutionPlan(goal=goal, steps=steps)
|
|
plan.plan_id = plan_id
|
|
plan.parallel_groups = [[s.step_id] for s in steps]
|
|
return plan
|
|
|
|
|
|
def make_step_result(
|
|
step_id: str,
|
|
status: PlanStepStatus = PlanStepStatus.COMPLETED,
|
|
result: dict | None = None,
|
|
) -> StepExecutionResult:
|
|
return StepExecutionResult(
|
|
step_id=step_id,
|
|
status=status,
|
|
result=result or {"content": f"result of {step_id}"},
|
|
error=None,
|
|
)
|
|
|
|
|
|
def make_plan_result(
|
|
plan_id: str = "plan-1",
|
|
status: TaskStatus = TaskStatus.COMPLETED,
|
|
) -> PlanExecutionResult:
|
|
step_results = {
|
|
"step-0": make_step_result("step-0"),
|
|
"step-1": make_step_result("step-1"),
|
|
}
|
|
return PlanExecutionResult(
|
|
plan_id=plan_id,
|
|
step_results=step_results,
|
|
status=status,
|
|
total_duration_ms=100.0,
|
|
)
|
|
|
|
|
|
def make_spec(spec_id: str = "plan-1", goal: str = "test goal") -> Spec:
|
|
return Spec(
|
|
spec_id=spec_id,
|
|
goal=goal,
|
|
steps=[SpecStep(step_id="s1", name="Step 1", description="First")],
|
|
)
|
|
|
|
|
|
def make_engine(
|
|
specs_dir: str,
|
|
*,
|
|
spec_review_handler=None,
|
|
spec_manager: SpecManager | None = None,
|
|
step_event_callback=None,
|
|
) -> tuple[PlanExecEngine, SpecManager]:
|
|
"""Build a PlanExecEngine wired with a SpecManager (tmp dir)."""
|
|
mgr = spec_manager if spec_manager is not None else SpecManager(specs_dir=specs_dir)
|
|
engine = PlanExecEngine(
|
|
llm_gateway=None,
|
|
spec_manager=mgr,
|
|
spec_review_handler=spec_review_handler,
|
|
step_event_callback=step_event_callback,
|
|
)
|
|
return engine, mgr
|
|
|
|
|
|
def patch_executor(plan_result: PlanExecutionResult):
|
|
"""Patch PlanExecutor so execute() returns the given plan_result."""
|
|
mock_executor = MagicMock()
|
|
mock_executor.execute = AsyncMock(return_value=plan_result)
|
|
return patch("agentkit.core.plan_exec_engine.PlanExecutor", return_value=mock_executor)
|
|
|
|
|
|
# ── Whitelist assertion ──────────────────────────────────
|
|
|
|
|
|
class TestWhitelist:
|
|
"""Prevent silent no-op regression (streaming-event-contract learning)."""
|
|
|
|
def test_spec_review_events_in_whitelist(self):
|
|
from agentkit.server.routes.chat import _VALID_TEAM_EVENT_TYPES
|
|
|
|
assert "spec_review_request" in _VALID_TEAM_EVENT_TYPES
|
|
assert "spec_review_reply" in _VALID_TEAM_EVENT_TYPES
|
|
|
|
|
|
# ── Happy path (AE4) ─────────────────────────────────────
|
|
|
|
|
|
class TestHappyPathStream:
|
|
"""PLAN_EXEC generates Spec -> spec_review_request -> suspend -> approve -> resume."""
|
|
|
|
async def test_approve_resumes_execution(self, tmp_path: Path):
|
|
seen_calls: list[tuple[str, str, list]] = []
|
|
|
|
async def handler(spec_id: str, goal: str, steps: list[dict]):
|
|
seen_calls.append((spec_id, goal, steps))
|
|
return ("approved", "")
|
|
|
|
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
|
|
plan = make_plan(plan_id="plan-1")
|
|
plan_result = make_plan_result()
|
|
|
|
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
|
|
with patch_executor(plan_result):
|
|
events = [
|
|
e
|
|
async for e in engine.execute_stream(
|
|
messages=[{"role": "user", "content": "do a complex task"}],
|
|
)
|
|
]
|
|
|
|
event_types = [e.event_type for e in events]
|
|
# Spec created, review request, review reply, then execution + final_answer
|
|
assert "spec_created" in event_types
|
|
assert "spec_review_request" in event_types
|
|
assert "spec_review_reply" in event_types
|
|
# request comes before reply (terminal-event symmetry / ordering)
|
|
assert event_types.index("spec_review_request") < event_types.index("spec_review_reply")
|
|
# Execution resumed after approval -> step events + final_answer
|
|
assert "final_answer" in event_types
|
|
final = next(e for e in events if e.event_type == "final_answer")
|
|
assert final.data["plan_status"] != "parked"
|
|
|
|
# Handler called with the spec_id matching the created spec, the goal,
|
|
# and a list of step dicts.
|
|
assert len(seen_calls) == 1
|
|
spec_id, goal, steps = seen_calls[0]
|
|
assert spec_id == "plan-1"
|
|
assert goal == "test goal"
|
|
assert isinstance(steps, list)
|
|
assert all("step_id" in s and "name" in s for s in steps)
|
|
|
|
async def test_nonstream_approve_returns_success(self, tmp_path: Path):
|
|
async def handler(spec_id, goal, steps):
|
|
return ("approved", "")
|
|
|
|
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
|
|
plan = make_plan(plan_id="plan-1")
|
|
plan_result = make_plan_result()
|
|
|
|
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
|
|
with patch_executor(plan_result):
|
|
result = await engine.execute(
|
|
messages=[{"role": "user", "content": "do a complex task"}],
|
|
)
|
|
|
|
assert isinstance(result, ReActResult)
|
|
assert result.status == "success"
|
|
assert result.output # aggregated output present
|
|
|
|
|
|
# ── Edge cases ───────────────────────────────────────────
|
|
|
|
|
|
class TestRejectionReplan:
|
|
"""User rejects -> replan with feedback -> new Spec -> review again."""
|
|
|
|
async def test_reject_then_approve_regenerates_spec(self, tmp_path: Path):
|
|
# First review rejects with feedback, second approves.
|
|
responses = [("rejected", "make it simpler"), ("approved", "")]
|
|
|
|
async def handler(spec_id, goal, steps):
|
|
return responses.pop(0)
|
|
|
|
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
|
|
plan1 = make_plan(plan_id="plan-1")
|
|
plan2 = make_plan(plan_id="plan-2", goal="test goal (simpler)")
|
|
plan_result = make_plan_result()
|
|
|
|
with patch.object(
|
|
engine._planner,
|
|
"generate_plan",
|
|
AsyncMock(side_effect=[plan1, plan2]),
|
|
):
|
|
with patch_executor(plan_result):
|
|
events = [
|
|
e
|
|
async for e in engine.execute_stream(
|
|
messages=[{"role": "user", "content": "do a complex task"}],
|
|
)
|
|
]
|
|
|
|
# Two spec_created events (plan-1 then plan-2 after replan), two
|
|
# review requests, two review replies.
|
|
spec_created = [e for e in events if e.event_type == "spec_created"]
|
|
requests = [e for e in events if e.event_type == "spec_review_request"]
|
|
replies = [e for e in events if e.event_type == "spec_review_reply"]
|
|
assert len(spec_created) == 2
|
|
assert len(requests) == 2
|
|
assert len(replies) == 2
|
|
# The second review targets a new spec_id (replan produced plan-2).
|
|
assert requests[0].data["spec_id"] == "plan-1"
|
|
assert requests[1].data["spec_id"] == "plan-2"
|
|
# First reply carries rejection + feedback; second carries approval.
|
|
assert replies[0].data["decision"] == "rejected"
|
|
assert replies[0].data["feedback"] == "make it simpler"
|
|
assert replies[1].data["decision"] == "approved"
|
|
# Execution resumed -> final_answer is success, not parked/failed.
|
|
final = next(e for e in events if e.event_type == "final_answer")
|
|
assert final.data["plan_status"] != "parked"
|
|
assert final.data["plan_status"] != "failed"
|
|
|
|
async def test_replan_cap_exhausted_fails(self, tmp_path: Path):
|
|
# Always reject: cap is 2 replans -> 3rd rejection exhausts the gate.
|
|
async def handler(spec_id, goal, steps):
|
|
return ("rejected", "still no good")
|
|
|
|
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
|
|
plans = [make_plan(plan_id=f"plan-{i}") for i in range(1, 6)]
|
|
plan_result = make_plan_result()
|
|
|
|
with patch.object(
|
|
engine._planner,
|
|
"generate_plan",
|
|
AsyncMock(side_effect=plans),
|
|
):
|
|
with patch_executor(plan_result):
|
|
events = [
|
|
e
|
|
async for e in engine.execute_stream(
|
|
messages=[{"role": "user", "content": "do a complex task"}],
|
|
)
|
|
]
|
|
|
|
requests = [e for e in events if e.event_type == "spec_review_request"]
|
|
replies = [e for e in events if e.event_type == "spec_review_reply"]
|
|
# 3 reviews (initial + 2 replans), all rejected, then exhausted.
|
|
assert len(requests) == _MAX_SPEC_REVIEW_REPLANS + 1
|
|
assert all(r.data["decision"] == "rejected" for r in replies)
|
|
final = next(e for e in events if e.event_type == "final_answer")
|
|
assert final.data["plan_status"] == "failed"
|
|
assert "replan cap" in final.data["output"]
|
|
|
|
|
|
class TestTimeoutParked:
|
|
"""Timeout (30min simulated) -> Spec parked (not failed)."""
|
|
|
|
async def test_stream_timeout_parks_spec(self, tmp_path: Path):
|
|
async def handler(spec_id, goal, steps):
|
|
raise asyncio.TimeoutError
|
|
|
|
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
|
|
plan = make_plan(plan_id="plan-1")
|
|
plan_result = make_plan_result()
|
|
|
|
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
|
|
with patch_executor(plan_result):
|
|
events = [
|
|
e
|
|
async for e in engine.execute_stream(
|
|
messages=[{"role": "user", "content": "do a complex task"}],
|
|
)
|
|
]
|
|
|
|
# Reply event carries decision=timeout + status=parked.
|
|
replies = [e for e in events if e.event_type == "spec_review_reply"]
|
|
assert len(replies) == 1
|
|
assert replies[0].data["decision"] == "timeout"
|
|
assert replies[0].data["status"] == "parked"
|
|
# final_answer surfaces parked (not failed).
|
|
final = next(e for e in events if e.event_type == "final_answer")
|
|
assert final.data["plan_status"] == "parked"
|
|
# Spec persisted as parked.
|
|
spec = mgr.get("plan-1")
|
|
assert spec is not None
|
|
assert spec.status == "parked"
|
|
|
|
async def test_nonstream_timeout_returns_parked_status(self, tmp_path: Path):
|
|
async def handler(spec_id, goal, steps):
|
|
raise asyncio.TimeoutError
|
|
|
|
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
|
|
plan = make_plan(plan_id="plan-1")
|
|
plan_result = make_plan_result()
|
|
|
|
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
|
|
with patch_executor(plan_result):
|
|
result = await engine.execute(
|
|
messages=[{"role": "user", "content": "do a complex task"}],
|
|
)
|
|
|
|
assert isinstance(result, ReActResult)
|
|
assert result.status == "parked"
|
|
assert mgr.get("plan-1").status == "parked"
|
|
|
|
|
|
class TestCancellation:
|
|
"""Stream cancelled mid-review -> CancelledError propagates, no deadlock."""
|
|
|
|
async def test_handler_cancelled_propagates(self, tmp_path: Path):
|
|
async def handler(spec_id, goal, steps):
|
|
raise asyncio.CancelledError
|
|
|
|
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
|
|
plan = make_plan(plan_id="plan-1")
|
|
plan_result = make_plan_result()
|
|
|
|
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
|
|
with patch_executor(plan_result):
|
|
with pytest.raises(asyncio.CancelledError):
|
|
async for _ in engine.execute_stream(
|
|
messages=[{"role": "user", "content": "do a complex task"}],
|
|
):
|
|
pass
|
|
|
|
async def test_token_cancelled_before_gate_raises_task_cancelled(self, tmp_path: Path):
|
|
async def handler(spec_id, goal, steps): # pragma: no cover - never reached
|
|
return ("approved", "")
|
|
|
|
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
|
|
token = CancellationToken()
|
|
token.cancel()
|
|
plan = make_plan(plan_id="plan-1")
|
|
plan_result = make_plan_result()
|
|
|
|
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
|
|
with patch_executor(plan_result):
|
|
with pytest.raises(TaskCancelledError):
|
|
async for _ in engine.execute_stream(
|
|
messages=[{"role": "user", "content": "do a complex task"}],
|
|
cancellation_token=token,
|
|
):
|
|
pass
|
|
|
|
|
|
class TestBackwardCompat:
|
|
"""spec_review_handler None -> no gate; spec_manager None + handler -> skip."""
|
|
|
|
async def test_handler_none_skips_gate(self, tmp_path: Path):
|
|
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=None)
|
|
plan = make_plan(plan_id="plan-1")
|
|
plan_result = make_plan_result()
|
|
|
|
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
|
|
with patch_executor(plan_result):
|
|
events = [
|
|
e
|
|
async for e in engine.execute_stream(
|
|
messages=[{"role": "user", "content": "do a complex task"}],
|
|
)
|
|
]
|
|
|
|
event_types = [e.event_type for e in events]
|
|
# Spec still created, but no review gate events.
|
|
assert "spec_created" in event_types
|
|
assert "spec_review_request" not in event_types
|
|
assert "spec_review_reply" not in event_types
|
|
assert "final_answer" in event_types
|
|
|
|
async def test_spec_manager_none_handler_set_skips_gate(self, tmp_path: Path):
|
|
# handler set but spec_manager None -> gate skipped with a warning,
|
|
# execution proceeds (no crash, no spec_review events).
|
|
async def handler(spec_id, goal, steps): # pragma: no cover - never reached
|
|
return ("approved", "")
|
|
|
|
engine = PlanExecEngine(llm_gateway=None, spec_manager=None, spec_review_handler=handler)
|
|
plan = make_plan(plan_id="plan-1")
|
|
plan_result = make_plan_result()
|
|
|
|
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
|
|
with patch_executor(plan_result):
|
|
events = [
|
|
e
|
|
async for e in engine.execute_stream(
|
|
messages=[{"role": "user", "content": "do a complex task"}],
|
|
)
|
|
]
|
|
|
|
event_types = [e.event_type for e in events]
|
|
assert "spec_created" not in event_types # no spec_manager -> no spec
|
|
assert "spec_review_request" not in event_types
|
|
assert "final_answer" in event_types
|
|
|
|
|
|
# ── Error / failure paths ────────────────────────────────
|
|
|
|
|
|
class TestHandlerRaises:
|
|
"""Handler raises a non-timeout/cancel exception -> propagated."""
|
|
|
|
async def test_handler_value_error_propagates(self, tmp_path: Path):
|
|
async def handler(spec_id, goal, steps):
|
|
raise ValueError("handler blew up")
|
|
|
|
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
|
|
plan = make_plan(plan_id="plan-1")
|
|
plan_result = make_plan_result()
|
|
|
|
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
|
|
with patch_executor(plan_result):
|
|
with pytest.raises(ValueError, match="handler blew up"):
|
|
async for _ in engine.execute_stream(
|
|
messages=[{"role": "user", "content": "do a complex task"}],
|
|
):
|
|
pass
|
|
|
|
|
|
class TestUnknownSpecReviewId:
|
|
"""An unknown spec_review_id is ignored (no crash) — mirrors the WS loop."""
|
|
|
|
def test_unknown_id_ignored(self):
|
|
# Replicates the chat.py WS-loop guard: only known ids resolve a future.
|
|
pending: dict[str, asyncio.Future] = {}
|
|
loop = asyncio.new_event_loop()
|
|
try:
|
|
fut: asyncio.Future = loop.create_future()
|
|
pending["known-id"] = fut
|
|
# An unknown id must not raise (the loop logs + ignores).
|
|
unknown = "does-not-exist"
|
|
assert unknown not in pending # the guard the loop uses
|
|
# Known id resolves fine.
|
|
assert "known-id" in pending
|
|
finally:
|
|
loop.close()
|
|
|
|
|
|
# ── SpecManager integration ──────────────────────────────
|
|
|
|
|
|
class TestSpecManagerParkResume:
|
|
"""park()/resume() round-trip; parked survives reload; confirm() works."""
|
|
|
|
def test_park_sets_status_parked(self, tmp_path: Path):
|
|
mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
|
|
mgr.create(make_spec(spec_id="s1"))
|
|
parked = mgr.park("s1")
|
|
assert parked is not None
|
|
assert parked.status == "parked"
|
|
|
|
def test_resume_sets_status_draft(self, tmp_path: Path):
|
|
mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
|
|
mgr.create(make_spec(spec_id="s1"))
|
|
mgr.park("s1")
|
|
resumed = mgr.resume("s1")
|
|
assert resumed is not None
|
|
assert resumed.status == "draft"
|
|
|
|
def test_resume_non_parked_is_noop(self, tmp_path: Path):
|
|
# ponytail: idempotent resume — no-op (returns spec unchanged) rather
|
|
# than raising on a double-resume.
|
|
mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
|
|
mgr.create(make_spec(spec_id="s1"))
|
|
# status is "draft", not "parked" -> resume is a no-op.
|
|
result = mgr.resume("s1")
|
|
assert result is not None
|
|
assert result.status == "draft"
|
|
|
|
def test_park_nonexistent_returns_none(self, tmp_path: Path):
|
|
mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
|
|
assert mgr.park("nope") is None
|
|
|
|
def test_resume_nonexistent_returns_none(self, tmp_path: Path):
|
|
mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
|
|
assert mgr.resume("nope") is None
|
|
|
|
def test_parked_survives_reload(self, tmp_path: Path):
|
|
# A fresh SpecManager instance loading from disk must see "parked".
|
|
specs_dir = str(tmp_path / "specs")
|
|
mgr1 = SpecManager(specs_dir=specs_dir)
|
|
mgr1.create(make_spec(spec_id="s1"))
|
|
mgr1.park("s1")
|
|
|
|
mgr2 = SpecManager(specs_dir=specs_dir)
|
|
loaded = mgr2.get("s1")
|
|
assert loaded is not None
|
|
assert loaded.status == "parked"
|
|
|
|
def test_confirm_still_works(self, tmp_path: Path):
|
|
# Backward compat: the existing confirm() REST endpoint path.
|
|
mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
|
|
mgr.create(make_spec(spec_id="s1"))
|
|
confirmed = mgr.confirm("s1")
|
|
assert confirmed is not None
|
|
assert confirmed.status == "confirmed"
|
|
assert confirmed.confirmed_at is not None
|