"""Unit tests for U4: step budget phases + keep working bias (R11/R10). Covers: - ReActEngine.phase_budgets configuration (R11) - Loop detector threshold 3 with budgets vs 2 without (R10/RV22) - _reset_loop_detector preserves budget counters (KTD-9) - restore_budget_state checkpoint reconstruction (KTD-7) - PhasePolicy.step_budget field + serialization - PlanExecEngine threads phase_budgets through to ReActEngine - _force_advance_to_verification behavior - Integration: think quota forces phase advance - Integration: verify quota exhausted returns best result - Integration: reflect quota overrides max_reinjections - Backward compat: no phase_budgets = unchanged behavior """ from __future__ import annotations from unittest.mock import AsyncMock, MagicMock from agentkit.core.phase import WILDCARD, PhasePolicy, PhaseState from agentkit.core.plan_exec_engine import PlanExecEngine, ReActStepExecutor from agentkit.core.react import ReActEngine from agentkit.llm.gateway import LLMGateway from agentkit.llm.protocol import LLMResponse, TokenUsage, ToolCall from agentkit.tools.base import Tool # ── helpers ─────────────────────────────────────────────────────────── def make_mock_gateway(responses: list[LLMResponse] | None = None) -> MagicMock: """Mock LLMGateway. If responses given, chat returns them in order.""" gateway = MagicMock(spec=LLMGateway) if responses is not None: gateway.chat = AsyncMock(side_effect=responses) else: gateway.chat = AsyncMock(return_value=MagicMock()) return gateway def make_response( content: str = "", tool_calls: list[ToolCall] | None = None, prompt_tokens: int = 10, completion_tokens: int = 20, ) -> LLMResponse: return LLMResponse( content=content, model="test-model", usage=TokenUsage(prompt_tokens=prompt_tokens, completion_tokens=completion_tokens), tool_calls=tool_calls or [], ) class _FakeTool(Tool): """Minimal tool for integration tests.""" def __init__(self, name: str = "search", result: dict | None = None) -> None: super().__init__(name=name, description="fake tool") self._result = result or {"status": "ok"} async def execute(self, **kwargs) -> dict: return self._result def _wildcard_policy(start: PhaseState = PhaseState.PLANNING) -> PhasePolicy: """PhasePolicy allowing all tools in all phases.""" return PhasePolicy( whitelist={ PhaseState.PLANNING: frozenset({WILDCARD}), PhaseState.BUILDING: frozenset({WILDCARD}), PhaseState.VERIFICATION: frozenset({WILDCARD}), PhaseState.DELIVERY: frozenset({WILDCARD}), }, start_phase=start, ) # ── Configuration tests (R11) ───────────────────────────────────────── class TestPhaseBudgetsConfig: def test_phase_budgets_stored(self) -> None: engine = ReActEngine( llm_gateway=make_mock_gateway(), phase_budgets={"think": 7, "verify": 2, "reflect": 1}, ) assert engine._phase_budgets == {"think": 7, "verify": 2, "reflect": 1} def test_phase_budgets_default_none(self) -> None: engine = ReActEngine(llm_gateway=make_mock_gateway()) assert engine._phase_budgets is None def test_loop_threshold_raised_to_3_with_budgets(self) -> None: engine = ReActEngine( llm_gateway=make_mock_gateway(), phase_budgets={"think": 1}, ) assert engine._loop_threshold == 3 def test_loop_threshold_default_2_without_budgets(self) -> None: engine = ReActEngine(llm_gateway=make_mock_gateway()) assert engine._loop_threshold == 2 def test_max_reinjections_overridden_by_reflect_budget(self) -> None: engine = ReActEngine( llm_gateway=make_mock_gateway(), max_reinjections=5, phase_budgets={"reflect": 2}, ) assert engine._max_reinjections == 2 def test_max_reinjections_unchanged_without_reflect_budget(self) -> None: engine = ReActEngine( llm_gateway=make_mock_gateway(), max_reinjections=3, phase_budgets={"think": 5}, ) assert engine._max_reinjections == 3 def test_budget_counters_init_zero(self) -> None: engine = ReActEngine( llm_gateway=make_mock_gateway(), phase_budgets={"think": 1}, ) assert engine._think_count == 0 assert engine._verify_count == 0 assert engine._reflect_count == 0 # ── _reset_loop_detector (KTD-9) ────────────────────────────────────── class TestResetLoopDetector: def test_clears_loop_window(self) -> None: engine = ReActEngine(llm_gateway=make_mock_gateway()) engine._loop_window.append("hash1") engine._loop_window.append("hash2") engine._loop_corrected = True engine._reset_loop_detector() assert len(engine._loop_window) == 0 assert engine._loop_corrected is False def test_preserves_budget_counters(self) -> None: """KTD-9: _reset_loop_detector must NOT reset budget counters.""" engine = ReActEngine( llm_gateway=make_mock_gateway(), phase_budgets={"think": 5}, ) engine._think_count = 3 engine._verify_count = 1 engine._reflect_count = 2 engine._loop_window.append("hash1") engine._reset_loop_detector() assert engine._think_count == 3 assert engine._verify_count == 1 assert engine._reflect_count == 2 def test_preserves_phase_state(self) -> None: """KTD-9: _reset_loop_detector must NOT reset phase state.""" policy = _wildcard_policy() engine = ReActEngine( llm_gateway=make_mock_gateway(), phase_policy=policy, phase_budgets={"think": 5}, ) engine._current_phase = PhaseState.BUILDING engine._steps_in_phase = 4 engine._reset_loop_detector() assert engine._current_phase == PhaseState.BUILDING assert engine._steps_in_phase == 4 # ── restore_budget_state (KTD-7) ────────────────────────────────────── class TestRestoreBudgetState: def test_restores_counters(self) -> None: engine = ReActEngine( llm_gateway=make_mock_gateway(), phase_budgets={"think": 5}, ) engine.restore_budget_state(think=4, verify=2, reflect=1) assert engine._think_count == 4 assert engine._verify_count == 2 assert engine._reflect_count == 1 def test_restore_after_reset(self) -> None: """KTD-7: restore_budget_state called after reset() overrides zeros.""" engine = ReActEngine( llm_gateway=make_mock_gateway(), phase_budgets={"think": 5}, ) engine._think_count = 3 engine._verify_count = 1 engine._reflect_count = 1 engine.reset() assert engine._think_count == 0 engine.restore_budget_state(think=3, verify=1, reflect=1) assert engine._think_count == 3 assert engine._verify_count == 1 assert engine._reflect_count == 1 # ── reset() behavior ────────────────────────────────────────────────── class TestResetClearsBudgets: def test_reset_zeros_budget_counters(self) -> None: engine = ReActEngine( llm_gateway=make_mock_gateway(), phase_budgets={"think": 5}, ) engine._think_count = 7 engine._verify_count = 3 engine._reflect_count = 2 engine.reset() assert engine._think_count == 0 assert engine._verify_count == 0 assert engine._reflect_count == 0 def test_reset_clears_loop_detector(self) -> None: engine = ReActEngine(llm_gateway=make_mock_gateway()) engine._loop_window.append("hash1") engine._loop_corrected = True engine.reset() assert len(engine._loop_window) == 0 assert engine._loop_corrected is False # ── _check_tool_loop threshold (R10/RV22) ───────────────────────────── class TestCheckToolLoopThreshold: def test_threshold_3_with_budgets(self) -> None: """R10/RV22: loop threshold raised from 2 to 3 with phase_budgets.""" engine = ReActEngine( llm_gateway=make_mock_gateway(), phase_budgets={"think": 5}, ) assert engine._loop_threshold == 3 tc = [ToolCall(id="1", name="search", arguments={"q": "x"})] # 1st call: count=1 < 3 assert engine._check_tool_loop(tc) is None # 2nd call: count=2 < 3 assert engine._check_tool_loop(tc) is None # 3rd call: count=3 >= 3 assert engine._check_tool_loop(tc) == "search" def test_threshold_2_without_budgets(self) -> None: """Backward compat: threshold stays 2 without phase_budgets.""" engine = ReActEngine(llm_gateway=make_mock_gateway()) assert engine._loop_threshold == 2 tc = [ToolCall(id="1", name="search", arguments={"q": "x"})] # 1st call: count=1 < 2 assert engine._check_tool_loop(tc) is None # 2nd call: count=2 >= 2 assert engine._check_tool_loop(tc) == "search" # ── PhasePolicy.step_budget (KTD-7) ─────────────────────────────────── class TestPhasePolicyStepBudget: def test_step_budget_defaults_none(self) -> None: policy = PhasePolicy( whitelist={PhaseState.PLANNING: frozenset({WILDCARD})}, ) assert policy.step_budget is None def test_step_budget_set(self) -> None: policy = PhasePolicy( whitelist={PhaseState.PLANNING: frozenset({WILDCARD})}, step_budget=42, ) assert policy.step_budget == 42 def test_to_dict_includes_step_budget(self) -> None: policy = PhasePolicy( whitelist={PhaseState.PLANNING: frozenset({WILDCARD})}, step_budget=10, ) d = policy.to_dict() assert d["step_budget"] == 10 def test_to_dict_step_budget_none(self) -> None: policy = PhasePolicy( whitelist={PhaseState.PLANNING: frozenset({WILDCARD})}, ) d = policy.to_dict() assert d["step_budget"] is None # ── PlanExecEngine threading (R11) ──────────────────────────────────── class TestPlanExecEngineBudgets: def test_default_phase_budgets(self) -> None: engine = PlanExecEngine(llm_gateway=None) assert engine._phase_budgets == {"think": 7, "verify": 2, "reflect": 1} def test_custom_phase_budgets(self) -> None: custom = {"think": 10, "verify": 3, "reflect": 2} engine = PlanExecEngine(llm_gateway=None, phase_budgets=custom) assert engine._phase_budgets == custom # Ensure the module-level default wasn't mutated. assert engine._phase_budgets is not custom def test_executor_threads_budgets(self) -> None: executor = ReActStepExecutor( phase_budgets={"think": 5, "verify": 1, "reflect": 0}, ) assert executor._phase_budgets == {"think": 5, "verify": 1, "reflect": 0} def test_executor_defaults_none(self) -> None: executor = ReActStepExecutor() assert executor._phase_budgets is None # ── _force_advance_to_verification ──────────────────────────────────── class TestForceAdvanceToVerification: def test_advances_from_planning_to_verification(self) -> None: policy = _wildcard_policy(start=PhaseState.PLANNING) engine = ReActEngine( llm_gateway=make_mock_gateway(), phase_policy=policy, phase_budgets={"think": 1}, ) assert engine.current_phase == PhaseState.PLANNING engine._force_advance_to_verification() assert engine.current_phase == PhaseState.VERIFICATION def test_advances_from_building_to_verification(self) -> None: policy = _wildcard_policy(start=PhaseState.BUILDING) engine = ReActEngine( llm_gateway=make_mock_gateway(), phase_policy=policy, ) assert engine.current_phase == PhaseState.BUILDING engine._force_advance_to_verification() assert engine.current_phase == PhaseState.VERIFICATION def test_no_op_when_already_verification(self) -> None: policy = _wildcard_policy(start=PhaseState.VERIFICATION) engine = ReActEngine( llm_gateway=make_mock_gateway(), phase_policy=policy, ) engine._force_advance_to_verification() assert engine.current_phase == PhaseState.VERIFICATION def test_no_op_without_policy(self) -> None: engine = ReActEngine(llm_gateway=make_mock_gateway()) engine._force_advance_to_verification() assert engine.current_phase is None # ── Integration: think quota forces phase advance ───────────────────── class TestThinkQuotaIntegration: async def test_think_quota_forces_advance_to_verification(self) -> None: """R11: think quota exhausted forces advance to VERIFICATION.""" policy = _wildcard_policy(start=PhaseState.PLANNING) tool = _FakeTool(name="search", result={"found": True}) gateway = make_mock_gateway( [ make_response(tool_calls=[ToolCall(id="tc_1", name="search", arguments={})]), make_response(content="Done"), ] ) engine = ReActEngine( llm_gateway=gateway, phase_policy=policy, phase_budgets={"think": 1}, ) result = await engine.execute( messages=[{"role": "user", "content": "search and report"}], tools=[tool], ) # After 1 think step, phase should have advanced to VERIFICATION. assert engine.current_phase == PhaseState.VERIFICATION assert result.status == "success" assert result.output == "Done" async def test_think_quota_not_triggered_when_in_verification(self) -> None: """Think quota only counts PLANNING/BUILDING steps, not VERIFICATION.""" policy = _wildcard_policy(start=PhaseState.VERIFICATION) tool = _FakeTool(name="search", result={"found": True}) gateway = make_mock_gateway( [ make_response(tool_calls=[ToolCall(id="tc_1", name="search", arguments={})]), make_response(tool_calls=[ToolCall(id="tc_2", name="search", arguments={})]), make_response(content="Done"), ] ) engine = ReActEngine( llm_gateway=gateway, phase_policy=policy, phase_budgets={"think": 1}, ) await engine.execute( messages=[{"role": "user", "content": "verify stuff"}], tools=[tool], ) # Starting in VERIFICATION, think_count should stay 0. assert engine._think_count == 0 assert engine.current_phase == PhaseState.VERIFICATION # ── Integration: verify quota exhausted returns best result ──────────── class TestVerifyQuotaIntegration: async def test_verify_quota_exhausted_returns_best(self, monkeypatch) -> None: """R11: when verify quota exhausted, return best result without verify.""" from agentkit.core.verification_loop import VerificationResult class _FailVLoop: def __init__(self, **kwargs) -> None: pass async def verify(self) -> VerificationResult: return VerificationResult( passed=False, attempts=1, test_output="fail", errors=["err"] ) monkeypatch.setattr("agentkit.core.verification_loop.VerificationLoop", _FailVLoop) policy = _wildcard_policy(start=PhaseState.VERIFICATION) gateway = make_mock_gateway( [ make_response(content="answer 1"), make_response(content="answer 2"), ] ) engine = ReActEngine( llm_gateway=gateway, phase_policy=policy, verification_enabled=True, verification_commands=["pytest"], phase_budgets={"think": 5, "verify": 1, "reflect": 1}, ) result = await engine.execute( messages=[{"role": "user", "content": "do something"}], ) # First answer: verify_count=0 < 1, verify fails, reinject. # Second answer: verify_count=1 >= 1, skip verify, return best. assert result.output == "answer 2" assert engine._verify_count == 1 async def test_verify_quota_zero_skips_verification(self, monkeypatch) -> None: """R11: verify quota 0 means never verify.""" from agentkit.core.verification_loop import VerificationResult class _NeverCalledVLoop: def __init__(self, **kwargs) -> None: pass async def verify(self) -> VerificationResult: raise AssertionError("verify() should not be called with quota 0") monkeypatch.setattr("agentkit.core.verification_loop.VerificationLoop", _NeverCalledVLoop) policy = _wildcard_policy(start=PhaseState.VERIFICATION) gateway = make_mock_gateway( [ make_response(content="immediate answer"), ] ) engine = ReActEngine( llm_gateway=gateway, phase_policy=policy, verification_enabled=True, verification_commands=["pytest"], phase_budgets={"think": 5, "verify": 0, "reflect": 0}, ) result = await engine.execute( messages=[{"role": "user", "content": "quick task"}], ) assert result.output == "immediate answer" assert engine._verify_count == 0 # ── Integration: reflect quota (R10 keep-working bias) ───────────────── class TestReflectQuotaIntegration: async def test_reflect_quota_resets_loop_detector(self, monkeypatch) -> None: """R10/KTD-9: reflect reinjection resets loop detector between attempts.""" from agentkit.core.verification_loop import VerificationResult class _FailVLoop: def __init__(self, **kwargs) -> None: pass async def verify(self) -> VerificationResult: return VerificationResult( passed=False, attempts=1, test_output="fail", errors=["err"] ) monkeypatch.setattr("agentkit.core.verification_loop.VerificationLoop", _FailVLoop) policy = _wildcard_policy(start=PhaseState.VERIFICATION) gateway = make_mock_gateway( [ make_response(content="attempt 1"), make_response(content="attempt 2"), ] ) engine = ReActEngine( llm_gateway=gateway, phase_policy=policy, verification_enabled=True, verification_commands=["pytest"], phase_budgets={"think": 5, "verify": 3, "reflect": 1}, ) await engine.execute( messages=[{"role": "user", "content": "do something"}], ) # After reinjection, _reflect_count should be 1 and loop_window cleared. assert engine._reflect_count == 1 assert len(engine._loop_window) == 0 assert engine._loop_corrected is False async def test_reflect_quota_resets_think_count(self, monkeypatch) -> None: """R10: reflect reinjection resets think quota for next attempt.""" from agentkit.core.verification_loop import VerificationResult class _FailVLoop: def __init__(self, **kwargs) -> None: pass async def verify(self) -> VerificationResult: return VerificationResult( passed=False, attempts=1, test_output="fail", errors=["err"] ) monkeypatch.setattr("agentkit.core.verification_loop.VerificationLoop", _FailVLoop) policy = _wildcard_policy(start=PhaseState.VERIFICATION) gateway = make_mock_gateway( [ make_response(content="attempt 1"), make_response(content="attempt 2"), ] ) engine = ReActEngine( llm_gateway=gateway, phase_policy=policy, verification_enabled=True, verification_commands=["pytest"], phase_budgets={"think": 5, "verify": 3, "reflect": 1}, ) await engine.execute( messages=[{"role": "user", "content": "do something"}], ) # After reinjection, think_count should be reset to 0. assert engine._think_count == 0 async def test_reflect_quota_exhausted_breaks(self, monkeypatch) -> None: """R10: when reflect quota exhausted, verify fail breaks (not reinject).""" from agentkit.core.verification_loop import VerificationResult class _FailVLoop: def __init__(self, **kwargs) -> None: pass async def verify(self) -> VerificationResult: return VerificationResult( passed=False, attempts=1, test_output="fail", errors=["err"] ) monkeypatch.setattr("agentkit.core.verification_loop.VerificationLoop", _FailVLoop) policy = _wildcard_policy(start=PhaseState.VERIFICATION) gateway = make_mock_gateway( [ make_response(content="only attempt"), ] ) engine = ReActEngine( llm_gateway=gateway, phase_policy=policy, verification_enabled=True, verification_commands=["pytest"], phase_budgets={"think": 5, "verify": 3, "reflect": 0}, ) result = await engine.execute( messages=[{"role": "user", "content": "do something"}], ) # reflect=0 means max_reinjections=0, so verify fail breaks immediately. assert engine._reflect_count == 0 assert result.status == "verify_failed" # ── Backward compatibility ──────────────────────────────────────────── class TestBackwardCompat: async def test_no_budgets_unchanged_behavior(self) -> None: """Without phase_budgets, engine behaves identically to before U4.""" gateway = make_mock_gateway( [ make_response(content="hello"), ] ) engine = ReActEngine(llm_gateway=gateway) result = await engine.execute( messages=[{"role": "user", "content": "hi"}], ) assert result.output == "hello" assert result.status == "success" assert engine._loop_threshold == 2 assert engine._phase_budgets is None async def test_no_budgets_loop_threshold_2(self) -> None: """Without phase_budgets, loop detector still uses threshold 2.""" engine = ReActEngine(llm_gateway=make_mock_gateway()) assert engine._loop_threshold == 2 tc = [ToolCall(id="1", name="search", arguments={"q": "x"})] assert engine._check_tool_loop(tc) is None assert engine._check_tool_loop(tc) == "search" def test_max_reinjections_respected_without_budgets(self) -> None: engine = ReActEngine( llm_gateway=make_mock_gateway(), max_reinjections=3, ) assert engine._max_reinjections == 3