fischer-agentkit/tests/unit/test_spec_review_gate.py

518 lines
21 KiB
Python

"""Tests for U8: spec review gate (R8).
Covers:
- Happy path (AE4): PLAN_EXEC pauses for review, user approves, execution resumes
- Rejection -> replan -> re-review; replan cap (2) -> failure (not infinite loop)
- Timeout -> Spec parked (not failed); ReActResult status="parked"
- Stream cancelled mid-review -> CancelledError propagates, no deadlock
- spec_review_handler None -> backward compat (no gate)
- spec_manager None + handler set -> skip gate + warn
- Handler raises -> exception propagated
- SpecManager.park()/resume() round-trip; parked survives reload; confirm() works
- Whitelist assertion (silent no-op prevention)
- Unknown spec_review_id ignored (no crash)
"""
from __future__ import annotations
import asyncio
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from agentkit.core.exceptions import TaskCancelledError
from agentkit.core.plan_exec_engine import PlanExecEngine, _MAX_SPEC_REVIEW_REPLANS
from agentkit.core.plan_executor import PlanExecutionResult, StepExecutionResult
from agentkit.core.plan_schema import ExecutionPlan, PlanStep, PlanStepStatus
from agentkit.core.protocol import CancellationToken, TaskStatus
from agentkit.core.react import ReActResult
from agentkit.core.spec_manager import Spec, SpecManager, SpecStep
# ── Helpers ──────────────────────────────────────────────
def make_plan(
goal: str = "test goal",
plan_id: str = "plan-1",
steps: list[PlanStep] | None = None,
) -> ExecutionPlan:
"""Construct an ExecutionPlan with a distinct plan_id."""
if steps is None:
steps = [
PlanStep(step_id="step-0", name="Step 0", description="First step"),
PlanStep(step_id="step-1", name="Step 1", description="Second step"),
]
plan = ExecutionPlan(goal=goal, steps=steps)
plan.plan_id = plan_id
plan.parallel_groups = [[s.step_id] for s in steps]
return plan
def make_step_result(
step_id: str,
status: PlanStepStatus = PlanStepStatus.COMPLETED,
result: dict | None = None,
) -> StepExecutionResult:
return StepExecutionResult(
step_id=step_id,
status=status,
result=result or {"content": f"result of {step_id}"},
error=None,
)
def make_plan_result(
plan_id: str = "plan-1",
status: TaskStatus = TaskStatus.COMPLETED,
) -> PlanExecutionResult:
step_results = {
"step-0": make_step_result("step-0"),
"step-1": make_step_result("step-1"),
}
return PlanExecutionResult(
plan_id=plan_id,
step_results=step_results,
status=status,
total_duration_ms=100.0,
)
def make_spec(spec_id: str = "plan-1", goal: str = "test goal") -> Spec:
return Spec(
spec_id=spec_id,
goal=goal,
steps=[SpecStep(step_id="s1", name="Step 1", description="First")],
)
def make_engine(
specs_dir: str,
*,
spec_review_handler=None,
spec_manager: SpecManager | None = None,
step_event_callback=None,
) -> tuple[PlanExecEngine, SpecManager]:
"""Build a PlanExecEngine wired with a SpecManager (tmp dir)."""
mgr = spec_manager if spec_manager is not None else SpecManager(specs_dir=specs_dir)
engine = PlanExecEngine(
llm_gateway=None,
spec_manager=mgr,
spec_review_handler=spec_review_handler,
step_event_callback=step_event_callback,
)
return engine, mgr
def patch_executor(plan_result: PlanExecutionResult):
"""Patch PlanExecutor so execute() returns the given plan_result."""
mock_executor = MagicMock()
mock_executor.execute = AsyncMock(return_value=plan_result)
return patch("agentkit.core.plan_exec_engine.PlanExecutor", return_value=mock_executor)
# ── Whitelist assertion ──────────────────────────────────
class TestWhitelist:
"""Prevent silent no-op regression (streaming-event-contract learning)."""
def test_spec_review_events_in_whitelist(self):
from agentkit.server.routes.chat import _VALID_TEAM_EVENT_TYPES
assert "spec_review_request" in _VALID_TEAM_EVENT_TYPES
assert "spec_review_reply" in _VALID_TEAM_EVENT_TYPES
# ── Happy path (AE4) ─────────────────────────────────────
class TestHappyPathStream:
"""PLAN_EXEC generates Spec -> spec_review_request -> suspend -> approve -> resume."""
async def test_approve_resumes_execution(self, tmp_path: Path):
seen_calls: list[tuple[str, str, list]] = []
async def handler(spec_id: str, goal: str, steps: list[dict]):
seen_calls.append((spec_id, goal, steps))
return ("approved", "")
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
plan = make_plan(plan_id="plan-1")
plan_result = make_plan_result()
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
with patch_executor(plan_result):
events = [
e
async for e in engine.execute_stream(
messages=[{"role": "user", "content": "do a complex task"}],
)
]
event_types = [e.event_type for e in events]
# Spec created, review request, review reply, then execution + final_answer
assert "spec_created" in event_types
assert "spec_review_request" in event_types
assert "spec_review_reply" in event_types
# request comes before reply (terminal-event symmetry / ordering)
assert event_types.index("spec_review_request") < event_types.index("spec_review_reply")
# Execution resumed after approval -> step events + final_answer
assert "final_answer" in event_types
final = next(e for e in events if e.event_type == "final_answer")
assert final.data["plan_status"] != "parked"
# Handler called with the spec_id matching the created spec, the goal,
# and a list of step dicts.
assert len(seen_calls) == 1
spec_id, goal, steps = seen_calls[0]
assert spec_id == "plan-1"
assert goal == "test goal"
assert isinstance(steps, list)
assert all("step_id" in s and "name" in s for s in steps)
async def test_nonstream_approve_returns_success(self, tmp_path: Path):
async def handler(spec_id, goal, steps):
return ("approved", "")
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
plan = make_plan(plan_id="plan-1")
plan_result = make_plan_result()
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
with patch_executor(plan_result):
result = await engine.execute(
messages=[{"role": "user", "content": "do a complex task"}],
)
assert isinstance(result, ReActResult)
assert result.status == "success"
assert result.output # aggregated output present
# ── Edge cases ───────────────────────────────────────────
class TestRejectionReplan:
"""User rejects -> replan with feedback -> new Spec -> review again."""
async def test_reject_then_approve_regenerates_spec(self, tmp_path: Path):
# First review rejects with feedback, second approves.
responses = [("rejected", "make it simpler"), ("approved", "")]
async def handler(spec_id, goal, steps):
return responses.pop(0)
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
plan1 = make_plan(plan_id="plan-1")
plan2 = make_plan(plan_id="plan-2", goal="test goal (simpler)")
plan_result = make_plan_result()
with patch.object(
engine._planner,
"generate_plan",
AsyncMock(side_effect=[plan1, plan2]),
):
with patch_executor(plan_result):
events = [
e
async for e in engine.execute_stream(
messages=[{"role": "user", "content": "do a complex task"}],
)
]
# Two spec_created events (plan-1 then plan-2 after replan), two
# review requests, two review replies.
spec_created = [e for e in events if e.event_type == "spec_created"]
requests = [e for e in events if e.event_type == "spec_review_request"]
replies = [e for e in events if e.event_type == "spec_review_reply"]
assert len(spec_created) == 2
assert len(requests) == 2
assert len(replies) == 2
# The second review targets a new spec_id (replan produced plan-2).
assert requests[0].data["spec_id"] == "plan-1"
assert requests[1].data["spec_id"] == "plan-2"
# First reply carries rejection + feedback; second carries approval.
assert replies[0].data["decision"] == "rejected"
assert replies[0].data["feedback"] == "make it simpler"
assert replies[1].data["decision"] == "approved"
# Execution resumed -> final_answer is success, not parked/failed.
final = next(e for e in events if e.event_type == "final_answer")
assert final.data["plan_status"] != "parked"
assert final.data["plan_status"] != "failed"
async def test_replan_cap_exhausted_fails(self, tmp_path: Path):
# Always reject: cap is 2 replans -> 3rd rejection exhausts the gate.
async def handler(spec_id, goal, steps):
return ("rejected", "still no good")
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
plans = [make_plan(plan_id=f"plan-{i}") for i in range(1, 6)]
plan_result = make_plan_result()
with patch.object(
engine._planner,
"generate_plan",
AsyncMock(side_effect=plans),
):
with patch_executor(plan_result):
events = [
e
async for e in engine.execute_stream(
messages=[{"role": "user", "content": "do a complex task"}],
)
]
requests = [e for e in events if e.event_type == "spec_review_request"]
replies = [e for e in events if e.event_type == "spec_review_reply"]
# 3 reviews (initial + 2 replans), all rejected, then exhausted.
assert len(requests) == _MAX_SPEC_REVIEW_REPLANS + 1
assert all(r.data["decision"] == "rejected" for r in replies)
final = next(e for e in events if e.event_type == "final_answer")
assert final.data["plan_status"] == "failed"
assert "replan cap" in final.data["output"]
class TestTimeoutParked:
"""Timeout (30min simulated) -> Spec parked (not failed)."""
async def test_stream_timeout_parks_spec(self, tmp_path: Path):
async def handler(spec_id, goal, steps):
raise asyncio.TimeoutError
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
plan = make_plan(plan_id="plan-1")
plan_result = make_plan_result()
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
with patch_executor(plan_result):
events = [
e
async for e in engine.execute_stream(
messages=[{"role": "user", "content": "do a complex task"}],
)
]
# Reply event carries decision=timeout + status=parked.
replies = [e for e in events if e.event_type == "spec_review_reply"]
assert len(replies) == 1
assert replies[0].data["decision"] == "timeout"
assert replies[0].data["status"] == "parked"
# final_answer surfaces parked (not failed).
final = next(e for e in events if e.event_type == "final_answer")
assert final.data["plan_status"] == "parked"
# Spec persisted as parked.
spec = mgr.get("plan-1")
assert spec is not None
assert spec.status == "parked"
async def test_nonstream_timeout_returns_parked_status(self, tmp_path: Path):
async def handler(spec_id, goal, steps):
raise asyncio.TimeoutError
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
plan = make_plan(plan_id="plan-1")
plan_result = make_plan_result()
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
with patch_executor(plan_result):
result = await engine.execute(
messages=[{"role": "user", "content": "do a complex task"}],
)
assert isinstance(result, ReActResult)
assert result.status == "parked"
assert mgr.get("plan-1").status == "parked"
class TestCancellation:
"""Stream cancelled mid-review -> CancelledError propagates, no deadlock."""
async def test_handler_cancelled_propagates(self, tmp_path: Path):
async def handler(spec_id, goal, steps):
raise asyncio.CancelledError
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
plan = make_plan(plan_id="plan-1")
plan_result = make_plan_result()
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
with patch_executor(plan_result):
with pytest.raises(asyncio.CancelledError):
async for _ in engine.execute_stream(
messages=[{"role": "user", "content": "do a complex task"}],
):
pass
async def test_token_cancelled_before_gate_raises_task_cancelled(self, tmp_path: Path):
async def handler(spec_id, goal, steps): # pragma: no cover - never reached
return ("approved", "")
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
token = CancellationToken()
token.cancel()
plan = make_plan(plan_id="plan-1")
plan_result = make_plan_result()
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
with patch_executor(plan_result):
with pytest.raises(TaskCancelledError):
async for _ in engine.execute_stream(
messages=[{"role": "user", "content": "do a complex task"}],
cancellation_token=token,
):
pass
class TestBackwardCompat:
"""spec_review_handler None -> no gate; spec_manager None + handler -> skip."""
async def test_handler_none_skips_gate(self, tmp_path: Path):
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=None)
plan = make_plan(plan_id="plan-1")
plan_result = make_plan_result()
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
with patch_executor(plan_result):
events = [
e
async for e in engine.execute_stream(
messages=[{"role": "user", "content": "do a complex task"}],
)
]
event_types = [e.event_type for e in events]
# Spec still created, but no review gate events.
assert "spec_created" in event_types
assert "spec_review_request" not in event_types
assert "spec_review_reply" not in event_types
assert "final_answer" in event_types
async def test_spec_manager_none_handler_set_skips_gate(self, tmp_path: Path):
# handler set but spec_manager None -> gate skipped with a warning,
# execution proceeds (no crash, no spec_review events).
async def handler(spec_id, goal, steps): # pragma: no cover - never reached
return ("approved", "")
engine = PlanExecEngine(llm_gateway=None, spec_manager=None, spec_review_handler=handler)
plan = make_plan(plan_id="plan-1")
plan_result = make_plan_result()
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
with patch_executor(plan_result):
events = [
e
async for e in engine.execute_stream(
messages=[{"role": "user", "content": "do a complex task"}],
)
]
event_types = [e.event_type for e in events]
assert "spec_created" not in event_types # no spec_manager -> no spec
assert "spec_review_request" not in event_types
assert "final_answer" in event_types
# ── Error / failure paths ────────────────────────────────
class TestHandlerRaises:
"""Handler raises a non-timeout/cancel exception -> propagated."""
async def test_handler_value_error_propagates(self, tmp_path: Path):
async def handler(spec_id, goal, steps):
raise ValueError("handler blew up")
engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
plan = make_plan(plan_id="plan-1")
plan_result = make_plan_result()
with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
with patch_executor(plan_result):
with pytest.raises(ValueError, match="handler blew up"):
async for _ in engine.execute_stream(
messages=[{"role": "user", "content": "do a complex task"}],
):
pass
class TestUnknownSpecReviewId:
"""An unknown spec_review_id is ignored (no crash) — mirrors the WS loop."""
def test_unknown_id_ignored(self):
# Replicates the chat.py WS-loop guard: only known ids resolve a future.
pending: dict[str, asyncio.Future] = {}
loop = asyncio.new_event_loop()
try:
fut: asyncio.Future = loop.create_future()
pending["known-id"] = fut
# An unknown id must not raise (the loop logs + ignores).
unknown = "does-not-exist"
assert unknown not in pending # the guard the loop uses
# Known id resolves fine.
assert "known-id" in pending
finally:
loop.close()
# ── SpecManager integration ──────────────────────────────
class TestSpecManagerParkResume:
"""park()/resume() round-trip; parked survives reload; confirm() works."""
def test_park_sets_status_parked(self, tmp_path: Path):
mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
mgr.create(make_spec(spec_id="s1"))
parked = mgr.park("s1")
assert parked is not None
assert parked.status == "parked"
def test_resume_sets_status_draft(self, tmp_path: Path):
mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
mgr.create(make_spec(spec_id="s1"))
mgr.park("s1")
resumed = mgr.resume("s1")
assert resumed is not None
assert resumed.status == "draft"
def test_resume_non_parked_is_noop(self, tmp_path: Path):
# ponytail: idempotent resume — no-op (returns spec unchanged) rather
# than raising on a double-resume.
mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
mgr.create(make_spec(spec_id="s1"))
# status is "draft", not "parked" -> resume is a no-op.
result = mgr.resume("s1")
assert result is not None
assert result.status == "draft"
def test_park_nonexistent_returns_none(self, tmp_path: Path):
mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
assert mgr.park("nope") is None
def test_resume_nonexistent_returns_none(self, tmp_path: Path):
mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
assert mgr.resume("nope") is None
def test_parked_survives_reload(self, tmp_path: Path):
# A fresh SpecManager instance loading from disk must see "parked".
specs_dir = str(tmp_path / "specs")
mgr1 = SpecManager(specs_dir=specs_dir)
mgr1.create(make_spec(spec_id="s1"))
mgr1.park("s1")
mgr2 = SpecManager(specs_dir=specs_dir)
loaded = mgr2.get("s1")
assert loaded is not None
assert loaded.status == "parked"
def test_confirm_still_works(self, tmp_path: Path):
# Backward compat: the existing confirm() REST endpoint path.
mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
mgr.create(make_spec(spec_id="s1"))
confirmed = mgr.confirm("s1")
assert confirmed is not None
assert confirmed.status == "confirmed"
assert confirmed.confirmed_at is not None