"""Tests for U8: spec review gate (R8). Covers: - Happy path (AE4): PLAN_EXEC pauses for review, user approves, execution resumes - Rejection -> replan -> re-review; replan cap (2) -> failure (not infinite loop) - Timeout -> Spec parked (not failed); ReActResult status="parked" - Stream cancelled mid-review -> CancelledError propagates, no deadlock - spec_review_handler None -> backward compat (no gate) - spec_manager None + handler set -> skip gate + warn - Handler raises -> exception propagated - SpecManager.park()/resume() round-trip; parked survives reload; confirm() works - Whitelist assertion (silent no-op prevention) - Unknown spec_review_id ignored (no crash) """ from __future__ import annotations import asyncio from pathlib import Path from unittest.mock import AsyncMock, MagicMock, patch import pytest from agentkit.core.exceptions import TaskCancelledError from agentkit.core.plan_exec_engine import PlanExecEngine, _MAX_SPEC_REVIEW_REPLANS from agentkit.core.plan_executor import PlanExecutionResult, StepExecutionResult from agentkit.core.plan_schema import ExecutionPlan, PlanStep, PlanStepStatus from agentkit.core.protocol import CancellationToken, TaskStatus from agentkit.core.react import ReActResult from agentkit.core.spec_manager import Spec, SpecManager, SpecStep # ── Helpers ────────────────────────────────────────────── def make_plan( goal: str = "test goal", plan_id: str = "plan-1", steps: list[PlanStep] | None = None, ) -> ExecutionPlan: """Construct an ExecutionPlan with a distinct plan_id.""" if steps is None: steps = [ PlanStep(step_id="step-0", name="Step 0", description="First step"), PlanStep(step_id="step-1", name="Step 1", description="Second step"), ] plan = ExecutionPlan(goal=goal, steps=steps) plan.plan_id = plan_id plan.parallel_groups = [[s.step_id] for s in steps] return plan def make_step_result( step_id: str, status: PlanStepStatus = PlanStepStatus.COMPLETED, result: dict | None = None, ) -> StepExecutionResult: return StepExecutionResult( step_id=step_id, status=status, result=result or {"content": f"result of {step_id}"}, error=None, ) def make_plan_result( plan_id: str = "plan-1", status: TaskStatus = TaskStatus.COMPLETED, ) -> PlanExecutionResult: step_results = { "step-0": make_step_result("step-0"), "step-1": make_step_result("step-1"), } return PlanExecutionResult( plan_id=plan_id, step_results=step_results, status=status, total_duration_ms=100.0, ) def make_spec(spec_id: str = "plan-1", goal: str = "test goal") -> Spec: return Spec( spec_id=spec_id, goal=goal, steps=[SpecStep(step_id="s1", name="Step 1", description="First")], ) def make_engine( specs_dir: str, *, spec_review_handler=None, spec_manager: SpecManager | None = None, step_event_callback=None, ) -> tuple[PlanExecEngine, SpecManager]: """Build a PlanExecEngine wired with a SpecManager (tmp dir).""" mgr = spec_manager if spec_manager is not None else SpecManager(specs_dir=specs_dir) engine = PlanExecEngine( llm_gateway=None, spec_manager=mgr, spec_review_handler=spec_review_handler, step_event_callback=step_event_callback, ) return engine, mgr def patch_executor(plan_result: PlanExecutionResult): """Patch PlanExecutor so execute() returns the given plan_result.""" mock_executor = MagicMock() mock_executor.execute = AsyncMock(return_value=plan_result) return patch("agentkit.core.plan_exec_engine.PlanExecutor", return_value=mock_executor) # ── Whitelist assertion ────────────────────────────────── class TestWhitelist: """Prevent silent no-op regression (streaming-event-contract learning).""" def test_spec_review_events_in_whitelist(self): from agentkit.server.routes.chat import _VALID_TEAM_EVENT_TYPES assert "spec_review_request" in _VALID_TEAM_EVENT_TYPES assert "spec_review_reply" in _VALID_TEAM_EVENT_TYPES # ── Happy path (AE4) ───────────────────────────────────── class TestHappyPathStream: """PLAN_EXEC generates Spec -> spec_review_request -> suspend -> approve -> resume.""" async def test_approve_resumes_execution(self, tmp_path: Path): seen_calls: list[tuple[str, str, list]] = [] async def handler(spec_id: str, goal: str, steps: list[dict]): seen_calls.append((spec_id, goal, steps)) return ("approved", "") engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler) plan = make_plan(plan_id="plan-1") plan_result = make_plan_result() with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)): with patch_executor(plan_result): events = [ e async for e in engine.execute_stream( messages=[{"role": "user", "content": "do a complex task"}], ) ] event_types = [e.event_type for e in events] # Spec created, review request, review reply, then execution + final_answer assert "spec_created" in event_types assert "spec_review_request" in event_types assert "spec_review_reply" in event_types # request comes before reply (terminal-event symmetry / ordering) assert event_types.index("spec_review_request") < event_types.index("spec_review_reply") # Execution resumed after approval -> step events + final_answer assert "final_answer" in event_types final = next(e for e in events if e.event_type == "final_answer") assert final.data["plan_status"] != "parked" # Handler called with the spec_id matching the created spec, the goal, # and a list of step dicts. assert len(seen_calls) == 1 spec_id, goal, steps = seen_calls[0] assert spec_id == "plan-1" assert goal == "test goal" assert isinstance(steps, list) assert all("step_id" in s and "name" in s for s in steps) async def test_nonstream_approve_returns_success(self, tmp_path: Path): async def handler(spec_id, goal, steps): return ("approved", "") engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler) plan = make_plan(plan_id="plan-1") plan_result = make_plan_result() with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)): with patch_executor(plan_result): result = await engine.execute( messages=[{"role": "user", "content": "do a complex task"}], ) assert isinstance(result, ReActResult) assert result.status == "success" assert result.output # aggregated output present # ── Edge cases ─────────────────────────────────────────── class TestRejectionReplan: """User rejects -> replan with feedback -> new Spec -> review again.""" async def test_reject_then_approve_regenerates_spec(self, tmp_path: Path): # First review rejects with feedback, second approves. responses = [("rejected", "make it simpler"), ("approved", "")] async def handler(spec_id, goal, steps): return responses.pop(0) engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler) plan1 = make_plan(plan_id="plan-1") plan2 = make_plan(plan_id="plan-2", goal="test goal (simpler)") plan_result = make_plan_result() with patch.object( engine._planner, "generate_plan", AsyncMock(side_effect=[plan1, plan2]), ): with patch_executor(plan_result): events = [ e async for e in engine.execute_stream( messages=[{"role": "user", "content": "do a complex task"}], ) ] # Two spec_created events (plan-1 then plan-2 after replan), two # review requests, two review replies. spec_created = [e for e in events if e.event_type == "spec_created"] requests = [e for e in events if e.event_type == "spec_review_request"] replies = [e for e in events if e.event_type == "spec_review_reply"] assert len(spec_created) == 2 assert len(requests) == 2 assert len(replies) == 2 # The second review targets a new spec_id (replan produced plan-2). assert requests[0].data["spec_id"] == "plan-1" assert requests[1].data["spec_id"] == "plan-2" # First reply carries rejection + feedback; second carries approval. assert replies[0].data["decision"] == "rejected" assert replies[0].data["feedback"] == "make it simpler" assert replies[1].data["decision"] == "approved" # Execution resumed -> final_answer is success, not parked/failed. final = next(e for e in events if e.event_type == "final_answer") assert final.data["plan_status"] != "parked" assert final.data["plan_status"] != "failed" async def test_replan_cap_exhausted_fails(self, tmp_path: Path): # Always reject: cap is 2 replans -> 3rd rejection exhausts the gate. async def handler(spec_id, goal, steps): return ("rejected", "still no good") engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler) plans = [make_plan(plan_id=f"plan-{i}") for i in range(1, 6)] plan_result = make_plan_result() with patch.object( engine._planner, "generate_plan", AsyncMock(side_effect=plans), ): with patch_executor(plan_result): events = [ e async for e in engine.execute_stream( messages=[{"role": "user", "content": "do a complex task"}], ) ] requests = [e for e in events if e.event_type == "spec_review_request"] replies = [e for e in events if e.event_type == "spec_review_reply"] # 3 reviews (initial + 2 replans), all rejected, then exhausted. assert len(requests) == _MAX_SPEC_REVIEW_REPLANS + 1 assert all(r.data["decision"] == "rejected" for r in replies) final = next(e for e in events if e.event_type == "final_answer") assert final.data["plan_status"] == "failed" assert "replan cap" in final.data["output"] class TestTimeoutParked: """Timeout (30min simulated) -> Spec parked (not failed).""" async def test_stream_timeout_parks_spec(self, tmp_path: Path): async def handler(spec_id, goal, steps): raise asyncio.TimeoutError engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler) plan = make_plan(plan_id="plan-1") plan_result = make_plan_result() with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)): with patch_executor(plan_result): events = [ e async for e in engine.execute_stream( messages=[{"role": "user", "content": "do a complex task"}], ) ] # Reply event carries decision=timeout + status=parked. replies = [e for e in events if e.event_type == "spec_review_reply"] assert len(replies) == 1 assert replies[0].data["decision"] == "timeout" assert replies[0].data["status"] == "parked" # final_answer surfaces parked (not failed). final = next(e for e in events if e.event_type == "final_answer") assert final.data["plan_status"] == "parked" # Spec persisted as parked. spec = mgr.get("plan-1") assert spec is not None assert spec.status == "parked" async def test_nonstream_timeout_returns_parked_status(self, tmp_path: Path): async def handler(spec_id, goal, steps): raise asyncio.TimeoutError engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler) plan = make_plan(plan_id="plan-1") plan_result = make_plan_result() with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)): with patch_executor(plan_result): result = await engine.execute( messages=[{"role": "user", "content": "do a complex task"}], ) assert isinstance(result, ReActResult) assert result.status == "parked" assert mgr.get("plan-1").status == "parked" class TestCancellation: """Stream cancelled mid-review -> CancelledError propagates, no deadlock.""" async def test_handler_cancelled_propagates(self, tmp_path: Path): async def handler(spec_id, goal, steps): raise asyncio.CancelledError engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler) plan = make_plan(plan_id="plan-1") plan_result = make_plan_result() with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)): with patch_executor(plan_result): with pytest.raises(asyncio.CancelledError): async for _ in engine.execute_stream( messages=[{"role": "user", "content": "do a complex task"}], ): pass async def test_token_cancelled_before_gate_raises_task_cancelled(self, tmp_path: Path): async def handler(spec_id, goal, steps): # pragma: no cover - never reached return ("approved", "") engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler) token = CancellationToken() token.cancel() plan = make_plan(plan_id="plan-1") plan_result = make_plan_result() with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)): with patch_executor(plan_result): with pytest.raises(TaskCancelledError): async for _ in engine.execute_stream( messages=[{"role": "user", "content": "do a complex task"}], cancellation_token=token, ): pass class TestBackwardCompat: """spec_review_handler None -> no gate; spec_manager None + handler -> skip.""" async def test_handler_none_skips_gate(self, tmp_path: Path): engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=None) plan = make_plan(plan_id="plan-1") plan_result = make_plan_result() with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)): with patch_executor(plan_result): events = [ e async for e in engine.execute_stream( messages=[{"role": "user", "content": "do a complex task"}], ) ] event_types = [e.event_type for e in events] # Spec still created, but no review gate events. assert "spec_created" in event_types assert "spec_review_request" not in event_types assert "spec_review_reply" not in event_types assert "final_answer" in event_types async def test_spec_manager_none_handler_set_skips_gate(self, tmp_path: Path): # handler set but spec_manager None -> gate skipped with a warning, # execution proceeds (no crash, no spec_review events). async def handler(spec_id, goal, steps): # pragma: no cover - never reached return ("approved", "") engine = PlanExecEngine(llm_gateway=None, spec_manager=None, spec_review_handler=handler) plan = make_plan(plan_id="plan-1") plan_result = make_plan_result() with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)): with patch_executor(plan_result): events = [ e async for e in engine.execute_stream( messages=[{"role": "user", "content": "do a complex task"}], ) ] event_types = [e.event_type for e in events] assert "spec_created" not in event_types # no spec_manager -> no spec assert "spec_review_request" not in event_types assert "final_answer" in event_types # ── Error / failure paths ──────────────────────────────── class TestHandlerRaises: """Handler raises a non-timeout/cancel exception -> propagated.""" async def test_handler_value_error_propagates(self, tmp_path: Path): async def handler(spec_id, goal, steps): raise ValueError("handler blew up") engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler) plan = make_plan(plan_id="plan-1") plan_result = make_plan_result() with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)): with patch_executor(plan_result): with pytest.raises(ValueError, match="handler blew up"): async for _ in engine.execute_stream( messages=[{"role": "user", "content": "do a complex task"}], ): pass class TestUnknownSpecReviewId: """An unknown spec_review_id is ignored (no crash) — mirrors the WS loop.""" def test_unknown_id_ignored(self): # Replicates the chat.py WS-loop guard: only known ids resolve a future. pending: dict[str, asyncio.Future] = {} loop = asyncio.new_event_loop() try: fut: asyncio.Future = loop.create_future() pending["known-id"] = fut # An unknown id must not raise (the loop logs + ignores). unknown = "does-not-exist" assert unknown not in pending # the guard the loop uses # Known id resolves fine. assert "known-id" in pending finally: loop.close() # ── SpecManager integration ────────────────────────────── class TestSpecManagerParkResume: """park()/resume() round-trip; parked survives reload; confirm() works.""" def test_park_sets_status_parked(self, tmp_path: Path): mgr = SpecManager(specs_dir=str(tmp_path / "specs")) mgr.create(make_spec(spec_id="s1")) parked = mgr.park("s1") assert parked is not None assert parked.status == "parked" def test_resume_sets_status_draft(self, tmp_path: Path): mgr = SpecManager(specs_dir=str(tmp_path / "specs")) mgr.create(make_spec(spec_id="s1")) mgr.park("s1") resumed = mgr.resume("s1") assert resumed is not None assert resumed.status == "draft" def test_resume_non_parked_is_noop(self, tmp_path: Path): # ponytail: idempotent resume — no-op (returns spec unchanged) rather # than raising on a double-resume. mgr = SpecManager(specs_dir=str(tmp_path / "specs")) mgr.create(make_spec(spec_id="s1")) # status is "draft", not "parked" -> resume is a no-op. result = mgr.resume("s1") assert result is not None assert result.status == "draft" def test_park_nonexistent_returns_none(self, tmp_path: Path): mgr = SpecManager(specs_dir=str(tmp_path / "specs")) assert mgr.park("nope") is None def test_resume_nonexistent_returns_none(self, tmp_path: Path): mgr = SpecManager(specs_dir=str(tmp_path / "specs")) assert mgr.resume("nope") is None def test_parked_survives_reload(self, tmp_path: Path): # A fresh SpecManager instance loading from disk must see "parked". specs_dir = str(tmp_path / "specs") mgr1 = SpecManager(specs_dir=specs_dir) mgr1.create(make_spec(spec_id="s1")) mgr1.park("s1") mgr2 = SpecManager(specs_dir=specs_dir) loaded = mgr2.get("s1") assert loaded is not None assert loaded.status == "parked" def test_confirm_still_works(self, tmp_path: Path): # Backward compat: the existing confirm() REST endpoint path. mgr = SpecManager(specs_dir=str(tmp_path / "specs")) mgr.create(make_spec(spec_id="s1")) confirmed = mgr.confirm("s1") assert confirmed is not None assert confirmed.status == "confirmed" assert confirmed.confirmed_at is not None