fischer-agentkit/tests/unit/test_spec_review_gate.py

"""Tests for U8: spec review gate (R8).

Covers:
- Happy path (AE4): PLAN_EXEC pauses for review, user approves, execution resumes
- Rejection -> replan -> re-review; replan cap (2) -> failure (not infinite loop)
- Timeout -> Spec parked (not failed); ReActResult status="parked"
- Stream cancelled mid-review -> CancelledError propagates, no deadlock
- spec_review_handler None -> backward compat (no gate)
- spec_manager None + handler set -> skip gate + warn
- Handler raises -> exception propagated
- SpecManager.park()/resume() round-trip; parked survives reload; confirm() works
- Whitelist assertion (silent no-op prevention)
- Unknown spec_review_id ignored (no crash)
"""

from __future__ import annotations

import asyncio
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from agentkit.core.exceptions import TaskCancelledError
from agentkit.core.plan_exec_engine import PlanExecEngine, _MAX_SPEC_REVIEW_REPLANS
from agentkit.core.plan_executor import PlanExecutionResult, StepExecutionResult
from agentkit.core.plan_schema import ExecutionPlan, PlanStep, PlanStepStatus
from agentkit.core.protocol import CancellationToken, TaskStatus
from agentkit.core.react import ReActResult
from agentkit.core.spec_manager import Spec, SpecManager, SpecStep


# ── Helpers ──────────────────────────────────────────────


def make_plan(
    goal: str = "test goal",
    plan_id: str = "plan-1",
    steps: list[PlanStep] | None = None,
) -> ExecutionPlan:
    """Construct an ExecutionPlan with a distinct plan_id."""
    if steps is None:
        steps = [
            PlanStep(step_id="step-0", name="Step 0", description="First step"),
            PlanStep(step_id="step-1", name="Step 1", description="Second step"),
        ]
    plan = ExecutionPlan(goal=goal, steps=steps)
    plan.plan_id = plan_id
    plan.parallel_groups = [[s.step_id] for s in steps]
    return plan


def make_step_result(
    step_id: str,
    status: PlanStepStatus = PlanStepStatus.COMPLETED,
    result: dict | None = None,
) -> StepExecutionResult:
    return StepExecutionResult(
        step_id=step_id,
        status=status,
        result=result or {"content": f"result of {step_id}"},
        error=None,
    )


def make_plan_result(
    plan_id: str = "plan-1",
    status: TaskStatus = TaskStatus.COMPLETED,
) -> PlanExecutionResult:
    step_results = {
        "step-0": make_step_result("step-0"),
        "step-1": make_step_result("step-1"),
    }
    return PlanExecutionResult(
        plan_id=plan_id,
        step_results=step_results,
        status=status,
        total_duration_ms=100.0,
    )


def make_spec(spec_id: str = "plan-1", goal: str = "test goal") -> Spec:
    return Spec(
        spec_id=spec_id,
        goal=goal,
        steps=[SpecStep(step_id="s1", name="Step 1", description="First")],
    )


def make_engine(
    specs_dir: str,
    *,
    spec_review_handler=None,
    spec_manager: SpecManager | None = None,
    step_event_callback=None,
) -> tuple[PlanExecEngine, SpecManager]:
    """Build a PlanExecEngine wired with a SpecManager (tmp dir)."""
    mgr = spec_manager if spec_manager is not None else SpecManager(specs_dir=specs_dir)
    engine = PlanExecEngine(
        llm_gateway=None,
        spec_manager=mgr,
        spec_review_handler=spec_review_handler,
        step_event_callback=step_event_callback,
    )
    return engine, mgr


def patch_executor(plan_result: PlanExecutionResult):
    """Patch PlanExecutor so execute() returns the given plan_result."""
    mock_executor = MagicMock()
    mock_executor.execute = AsyncMock(return_value=plan_result)
    return patch("agentkit.core.plan_exec_engine.PlanExecutor", return_value=mock_executor)


# ── Whitelist assertion ──────────────────────────────────


class TestWhitelist:
    """Prevent silent no-op regression (streaming-event-contract learning)."""

    def test_spec_review_events_in_whitelist(self):
        from agentkit.server.routes.chat import _VALID_TEAM_EVENT_TYPES

        assert "spec_review_request" in _VALID_TEAM_EVENT_TYPES
        assert "spec_review_reply" in _VALID_TEAM_EVENT_TYPES


# ── Happy path (AE4) ─────────────────────────────────────


class TestHappyPathStream:
    """PLAN_EXEC generates Spec -> spec_review_request -> suspend -> approve -> resume."""

    async def test_approve_resumes_execution(self, tmp_path: Path):
        seen_calls: list[tuple[str, str, list]] = []

        async def handler(spec_id: str, goal: str, steps: list[dict]):
            seen_calls.append((spec_id, goal, steps))
            return ("approved", "")

        engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
            with patch_executor(plan_result):
                events = [
                    e
                    async for e in engine.execute_stream(
                        messages=[{"role": "user", "content": "do a complex task"}],
                    )
                ]

        event_types = [e.event_type for e in events]
        # Spec created, review request, review reply, then execution + final_answer
        assert "spec_created" in event_types
        assert "spec_review_request" in event_types
        assert "spec_review_reply" in event_types
        # request comes before reply (terminal-event symmetry / ordering)
        assert event_types.index("spec_review_request") < event_types.index("spec_review_reply")
        # Execution resumed after approval -> step events + final_answer
        assert "final_answer" in event_types
        final = next(e for e in events if e.event_type == "final_answer")
        assert final.data["plan_status"] != "parked"

        # Handler called with the spec_id matching the created spec, the goal,
        # and a list of step dicts.
        assert len(seen_calls) == 1
        spec_id, goal, steps = seen_calls[0]
        assert spec_id == "plan-1"
        assert goal == "test goal"
        assert isinstance(steps, list)
        assert all("step_id" in s and "name" in s for s in steps)

    async def test_nonstream_approve_returns_success(self, tmp_path: Path):
        async def handler(spec_id, goal, steps):
            return ("approved", "")

        engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
            with patch_executor(plan_result):
                result = await engine.execute(
                    messages=[{"role": "user", "content": "do a complex task"}],
                )

        assert isinstance(result, ReActResult)
        assert result.status == "success"
        assert result.output  # aggregated output present


# ── Edge cases ───────────────────────────────────────────


class TestRejectionReplan:
    """User rejects -> replan with feedback -> new Spec -> review again."""

    async def test_reject_then_approve_regenerates_spec(self, tmp_path: Path):
        # First review rejects with feedback, second approves.
        responses = [("rejected", "make it simpler"), ("approved", "")]

        async def handler(spec_id, goal, steps):
            return responses.pop(0)

        engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        plan1 = make_plan(plan_id="plan-1")
        plan2 = make_plan(plan_id="plan-2", goal="test goal (simpler)")
        plan_result = make_plan_result()

        with patch.object(
            engine._planner,
            "generate_plan",
            AsyncMock(side_effect=[plan1, plan2]),
        ):
            with patch_executor(plan_result):
                events = [
                    e
                    async for e in engine.execute_stream(
                        messages=[{"role": "user", "content": "do a complex task"}],
                    )
                ]

        # Two spec_created events (plan-1 then plan-2 after replan), two
        # review requests, two review replies.
        spec_created = [e for e in events if e.event_type == "spec_created"]
        requests = [e for e in events if e.event_type == "spec_review_request"]
        replies = [e for e in events if e.event_type == "spec_review_reply"]
        assert len(spec_created) == 2
        assert len(requests) == 2
        assert len(replies) == 2
        # The second review targets a new spec_id (replan produced plan-2).
        assert requests[0].data["spec_id"] == "plan-1"
        assert requests[1].data["spec_id"] == "plan-2"
        # First reply carries rejection + feedback; second carries approval.
        assert replies[0].data["decision"] == "rejected"
        assert replies[0].data["feedback"] == "make it simpler"
        assert replies[1].data["decision"] == "approved"
        # Execution resumed -> final_answer is success, not parked/failed.
        final = next(e for e in events if e.event_type == "final_answer")
        assert final.data["plan_status"] != "parked"
        assert final.data["plan_status"] != "failed"

    async def test_replan_cap_exhausted_fails(self, tmp_path: Path):
        # Always reject: cap is 2 replans -> 3rd rejection exhausts the gate.
        async def handler(spec_id, goal, steps):
            return ("rejected", "still no good")

        engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        plans = [make_plan(plan_id=f"plan-{i}") for i in range(1, 6)]
        plan_result = make_plan_result()

        with patch.object(
            engine._planner,
            "generate_plan",
            AsyncMock(side_effect=plans),
        ):
            with patch_executor(plan_result):
                events = [
                    e
                    async for e in engine.execute_stream(
                        messages=[{"role": "user", "content": "do a complex task"}],
                    )
                ]

        requests = [e for e in events if e.event_type == "spec_review_request"]
        replies = [e for e in events if e.event_type == "spec_review_reply"]
        # 3 reviews (initial + 2 replans), all rejected, then exhausted.
        assert len(requests) == _MAX_SPEC_REVIEW_REPLANS + 1
        assert all(r.data["decision"] == "rejected" for r in replies)
        final = next(e for e in events if e.event_type == "final_answer")
        assert final.data["plan_status"] == "failed"
        assert "replan cap" in final.data["output"]


class TestTimeoutParked:
    """Timeout (30min simulated) -> Spec parked (not failed)."""

    async def test_stream_timeout_parks_spec(self, tmp_path: Path):
        async def handler(spec_id, goal, steps):
            raise asyncio.TimeoutError

        engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
            with patch_executor(plan_result):
                events = [
                    e
                    async for e in engine.execute_stream(
                        messages=[{"role": "user", "content": "do a complex task"}],
                    )
                ]

        # Reply event carries decision=timeout + status=parked.
        replies = [e for e in events if e.event_type == "spec_review_reply"]
        assert len(replies) == 1
        assert replies[0].data["decision"] == "timeout"
        assert replies[0].data["status"] == "parked"
        # final_answer surfaces parked (not failed).
        final = next(e for e in events if e.event_type == "final_answer")
        assert final.data["plan_status"] == "parked"
        # Spec persisted as parked.
        spec = mgr.get("plan-1")
        assert spec is not None
        assert spec.status == "parked"

    async def test_nonstream_timeout_returns_parked_status(self, tmp_path: Path):
        async def handler(spec_id, goal, steps):
            raise asyncio.TimeoutError

        engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
            with patch_executor(plan_result):
                result = await engine.execute(
                    messages=[{"role": "user", "content": "do a complex task"}],
                )

        assert isinstance(result, ReActResult)
        assert result.status == "parked"
        assert mgr.get("plan-1").status == "parked"


class TestCancellation:
    """Stream cancelled mid-review -> CancelledError propagates, no deadlock."""

    async def test_handler_cancelled_propagates(self, tmp_path: Path):
        async def handler(spec_id, goal, steps):
            raise asyncio.CancelledError

        engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
            with patch_executor(plan_result):
                with pytest.raises(asyncio.CancelledError):
                    async for _ in engine.execute_stream(
                        messages=[{"role": "user", "content": "do a complex task"}],
                    ):
                        pass

    async def test_token_cancelled_before_gate_raises_task_cancelled(self, tmp_path: Path):
        async def handler(spec_id, goal, steps):  # pragma: no cover - never reached
            return ("approved", "")

        engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        token = CancellationToken()
        token.cancel()
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
            with patch_executor(plan_result):
                with pytest.raises(TaskCancelledError):
                    async for _ in engine.execute_stream(
                        messages=[{"role": "user", "content": "do a complex task"}],
                        cancellation_token=token,
                    ):
                        pass


class TestBackwardCompat:
    """spec_review_handler None -> no gate; spec_manager None + handler -> skip."""

    async def test_handler_none_skips_gate(self, tmp_path: Path):
        engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=None)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
            with patch_executor(plan_result):
                events = [
                    e
                    async for e in engine.execute_stream(
                        messages=[{"role": "user", "content": "do a complex task"}],
                    )
                ]

        event_types = [e.event_type for e in events]
        # Spec still created, but no review gate events.
        assert "spec_created" in event_types
        assert "spec_review_request" not in event_types
        assert "spec_review_reply" not in event_types
        assert "final_answer" in event_types

    async def test_spec_manager_none_handler_set_skips_gate(self, tmp_path: Path):
        # handler set but spec_manager None -> gate skipped with a warning,
        # execution proceeds (no crash, no spec_review events).
        async def handler(spec_id, goal, steps):  # pragma: no cover - never reached
            return ("approved", "")

        engine = PlanExecEngine(llm_gateway=None, spec_manager=None, spec_review_handler=handler)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
            with patch_executor(plan_result):
                events = [
                    e
                    async for e in engine.execute_stream(
                        messages=[{"role": "user", "content": "do a complex task"}],
                    )
                ]

        event_types = [e.event_type for e in events]
        assert "spec_created" not in event_types  # no spec_manager -> no spec
        assert "spec_review_request" not in event_types
        assert "final_answer" in event_types


# ── Error / failure paths ────────────────────────────────


class TestHandlerRaises:
    """Handler raises a non-timeout/cancel exception -> propagated."""

    async def test_handler_value_error_propagates(self, tmp_path: Path):
        async def handler(spec_id, goal, steps):
            raise ValueError("handler blew up")

        engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        with patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan)):
            with patch_executor(plan_result):
                with pytest.raises(ValueError, match="handler blew up"):
                    async for _ in engine.execute_stream(
                        messages=[{"role": "user", "content": "do a complex task"}],
                    ):
                        pass


class TestUnknownSpecReviewId:
    """An unknown spec_review_id is ignored (no crash) — mirrors the WS loop."""

    def test_unknown_id_ignored(self):
        # Replicates the chat.py WS-loop guard: only known ids resolve a future.
        pending: dict[str, asyncio.Future] = {}
        loop = asyncio.new_event_loop()
        try:
            fut: asyncio.Future = loop.create_future()
            pending["known-id"] = fut
            # An unknown id must not raise (the loop logs + ignores).
            unknown = "does-not-exist"
            assert unknown not in pending  # the guard the loop uses
            # Known id resolves fine.
            assert "known-id" in pending
        finally:
            loop.close()


# ── SpecManager integration ──────────────────────────────


class TestSpecManagerParkResume:
    """park()/resume() round-trip; parked survives reload; confirm() works."""

    def test_park_sets_status_parked(self, tmp_path: Path):
        mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
        mgr.create(make_spec(spec_id="s1"))
        parked = mgr.park("s1")
        assert parked is not None
        assert parked.status == "parked"

    def test_resume_sets_status_draft(self, tmp_path: Path):
        mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
        mgr.create(make_spec(spec_id="s1"))
        mgr.park("s1")
        resumed = mgr.resume("s1")
        assert resumed is not None
        assert resumed.status == "draft"

    def test_resume_non_parked_is_noop(self, tmp_path: Path):
        # ponytail: idempotent resume — no-op (returns spec unchanged) rather
        # than raising on a double-resume.
        mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
        mgr.create(make_spec(spec_id="s1"))
        # status is "draft", not "parked" -> resume is a no-op.
        result = mgr.resume("s1")
        assert result is not None
        assert result.status == "draft"

    def test_park_nonexistent_returns_none(self, tmp_path: Path):
        mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
        assert mgr.park("nope") is None

    def test_resume_nonexistent_returns_none(self, tmp_path: Path):
        mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
        assert mgr.resume("nope") is None

    def test_parked_survives_reload(self, tmp_path: Path):
        # A fresh SpecManager instance loading from disk must see "parked".
        specs_dir = str(tmp_path / "specs")
        mgr1 = SpecManager(specs_dir=specs_dir)
        mgr1.create(make_spec(spec_id="s1"))
        mgr1.park("s1")

        mgr2 = SpecManager(specs_dir=specs_dir)
        loaded = mgr2.get("s1")
        assert loaded is not None
        assert loaded.status == "parked"

    def test_confirm_still_works(self, tmp_path: Path):
        # Backward compat: the existing confirm() REST endpoint path.
        mgr = SpecManager(specs_dir=str(tmp_path / "specs"))
        mgr.create(make_spec(spec_id="s1"))
        confirmed = mgr.confirm("s1")
        assert confirmed is not None
        assert confirmed.status == "confirmed"
        assert confirmed.confirmed_at is not None