"""U4/G1: Verify 失败回灌 ReAct 测试 characterization-first: 先覆盖现有 verify 行为(max_reinjections=0 等价于不回灌), 再测新回灌行为(reinjection on first fail, break on second fail)。 R1: verify 首次失败 → errors 注入 conversation → LLM 自纠正 → 二次 verify 通过 R2: verify 二次失败 → 中断返回错误附 verify log R3: max_reinjections 可配置(默认 1),=0 等价于不回灌;回灌受 max_steps 约束 R13: ServerConfig.verification 配置项 R14: max_reinjections 默认值为 1 """ from __future__ import annotations from unittest.mock import AsyncMock, MagicMock, patch from agentkit.core.react import ReActEngine from agentkit.core.verification_loop import VerificationResult from agentkit.llm.gateway import LLMGateway from agentkit.llm.protocol import LLMResponse, TokenUsage # ── Helpers ────────────────────────────────────────────── def make_mock_gateway(responses: list[LLMResponse]) -> LLMGateway: """创建按顺序返回给定响应的 mock LLMGateway。""" gateway = MagicMock(spec=LLMGateway) gateway.chat = AsyncMock(side_effect=responses) gateway.get_provider_name_for_model = MagicMock(return_value=None) return gateway def make_response(content: str = "") -> LLMResponse: return LLMResponse( content=content, model="test-model", usage=TokenUsage(prompt_tokens=10, completion_tokens=20), tool_calls=[], ) def make_verify_result(passed: bool, errors: list[str] | None = None) -> VerificationResult: return VerificationResult( passed=passed, attempts=1, test_output="$ pytest\nFAILED test_x.py" if not passed else "$ pytest\nOK", errors=errors or ([] if passed else ["test_x.py::test_failed"]), ) def make_mock_vloop(verify_results: list[VerificationResult]) -> MagicMock: """创建一个 mock VerificationLoop,verify() 按顺序返回给定结果。""" vloop = MagicMock() vloop.verify = AsyncMock(side_effect=verify_results) return vloop # ── Characterization: max_reinjections=0 等价于当前行为 ────────── class TestVerifyCharacterization: """现有 verify 行为(max_reinjections=0):失败仅记录 trajectory,不回灌。""" async def test_verify_disabled_no_verify_step(self): """verification_enabled=False → 不运行 verify,trajectory 无 verification step。""" gateway = make_mock_gateway([make_response("final answer")]) engine = ReActEngine(llm_gateway=gateway, max_steps=3) result = await engine.execute( messages=[{"role": "user", "content": "do something"}], ) assert result.output == "final answer" assert all(s.tool_name != "verification" for s in result.trajectory) async def test_verify_pass_no_extra_step(self): """verify 通过 → 不追加 verification step。""" gateway = make_mock_gateway([make_response("answer")]) engine = ReActEngine( llm_gateway=gateway, max_steps=3, verification_enabled=True, verification_commands=["echo ok"], max_reinjections=0, ) with patch( "agentkit.core.verification_loop.VerificationLoop", return_value=make_mock_vloop([make_verify_result(passed=True)]), ): result = await engine.execute( messages=[{"role": "user", "content": "do something"}], ) assert result.output == "answer" assert all(s.tool_name != "verification" for s in result.trajectory) async def test_verify_fail_max_zero_no_reinjection(self): """max_reinjections=0 + verify 失败 → 仅记录 trajectory,不回灌 LLM。 这是当前行为的 characterization:gateway.chat 只被调用一次。 """ gateway = make_mock_gateway([make_response("bad answer")]) engine = ReActEngine( llm_gateway=gateway, max_steps=3, verification_enabled=True, verification_commands=["false"], max_reinjections=0, ) with patch( "agentkit.core.verification_loop.VerificationLoop", return_value=make_mock_vloop([make_verify_result(passed=False)]), ): result = await engine.execute( messages=[{"role": "user", "content": "do something"}], ) # LLM 只被调用一次(无回灌) assert gateway.chat.await_count == 1 # 输出仍保留 assert result.output == "bad answer" # trajectory 包含 verification step verify_steps = [s for s in result.trajectory if s.tool_name == "verification"] assert len(verify_steps) == 1 assert verify_steps[0].result["passed"] is False # ── R1: 回灌后 LLM 自纠正 → 二次 verify 通过 ────────── class TestVerifyReinjection: """verify 失败回灌 conversation,LLM 自纠正后二次 verify 通过。""" async def test_first_fail_reinject_second_pass(self): """R1: verify 首次失败 → errors 注入 conversation → LLM 修正 → 二次 verify 通过。""" gateway = make_mock_gateway( [ make_response("bad code"), # 第一次:错误答案 make_response("fixed code"), # 第二次:修正后答案 ] ) engine = ReActEngine( llm_gateway=gateway, max_steps=5, verification_enabled=True, verification_commands=["pytest"], max_reinjections=1, # 默认值,允许 1 次回灌 ) with patch( "agentkit.core.verification_loop.VerificationLoop", return_value=make_mock_vloop( [ make_verify_result(passed=False), # 第一次 verify 失败 make_verify_result(passed=True), # 第二次 verify 通过 ] ), ): result = await engine.execute( messages=[{"role": "user", "content": "write code"}], ) # LLM 被调用两次 assert gateway.chat.await_count == 2 # 最终输出是修正后的答案 assert result.output == "fixed code" # 二次 verify 通过,不追加 verification step verify_steps = [s for s in result.trajectory if s.tool_name == "verification"] assert len(verify_steps) == 0 async def test_reinjected_user_message_appears_in_conversation(self): """R1 集成:回灌的 user 消息出现在 conversation,含 errors 文本。""" gateway = make_mock_gateway( [ make_response("bad"), make_response("good"), ] ) engine = ReActEngine( llm_gateway=gateway, max_steps=5, verification_enabled=True, verification_commands=["pytest"], max_reinjections=1, ) with patch( "agentkit.core.verification_loop.VerificationLoop", return_value=make_mock_vloop( [ make_verify_result(passed=False, errors=["AssertionError: x != y"]), make_verify_result(passed=True), ] ), ): await engine.execute( messages=[{"role": "user", "content": "write code"}], ) # 第二次 LLM 调用时,conversation 应包含回灌的 user 消息 second_call = gateway.chat.await_args_list[1] msgs_sent = second_call.kwargs.get("messages") or second_call[1].get("messages") reinjected = [ m for m in msgs_sent if m.get("role") == "user" and "验证失败" in m.get("content", "") ] assert len(reinjected) >= 1 assert "AssertionError: x != y" in reinjected[-1]["content"] # ── R2: 二次 verify 失败 → 中断返回错误 ────────── class TestVerifyDoubleFailure: """verify 二次失败 → 中断,返回错误附 verify log。""" async def test_second_fail_breaks_with_verify_log(self): """R2: 二次 verify 失败 → 中断,trajectory 含 verify log + errors。""" gateway = make_mock_gateway( [ make_response("bad v1"), make_response("bad v2"), ] ) engine = ReActEngine( llm_gateway=gateway, max_steps=5, verification_enabled=True, verification_commands=["pytest"], max_reinjections=1, ) with patch( "agentkit.core.verification_loop.VerificationLoop", return_value=make_mock_vloop( [ make_verify_result(passed=False, errors=["err1"]), make_verify_result(passed=False, errors=["err2"]), ] ), ): result = await engine.execute( messages=[{"role": "user", "content": "write code"}], ) # LLM 被调用两次(initial + 1 reinjection) assert gateway.chat.await_count == 2 # 状态标记 verify 失败 assert result.status == "verify_failed" # 输出保留(LLM 最后的答案) assert result.output == "bad v2" # trajectory 含 verification step with errors verify_steps = [s for s in result.trajectory if s.tool_name == "verification"] assert len(verify_steps) == 1 assert verify_steps[0].result["passed"] is False assert "err2" in verify_steps[0].result["errors"] # ── R3: 配置 + 边界 ────────── class TestVerifyReinjectionConfig: """max_reinjections 配置测试。""" def test_default_max_reinjections_is_one(self): """R14 self-check: max_reinjections 默认值为 1。""" gateway = make_mock_gateway([]) engine = ReActEngine(llm_gateway=gateway) assert engine._max_reinjections == 1 async def test_max_reinjections_zero_skips_reinjection(self): """R3: max_reinjections=0 → 等价于不回灌(当前行为)。""" gateway = make_mock_gateway([make_response("only answer")]) engine = ReActEngine( llm_gateway=gateway, max_steps=5, verification_enabled=True, verification_commands=["false"], max_reinjections=0, ) with patch( "agentkit.core.verification_loop.VerificationLoop", return_value=make_mock_vloop([make_verify_result(passed=False)]), ): result = await engine.execute( messages=[{"role": "user", "content": "do something"}], ) assert gateway.chat.await_count == 1 # 无回灌 assert result.output == "only answer" async def test_reinjection_hits_max_steps_interrupts(self): """R3 edge: 回灌期间达到 max_steps → 中断(不无限循环)。""" # max_steps=2, max_reinjections=5(max_steps 先到) # LLM 调用 1:final answer → verify 失败 → reinject # LLM 调用 2:final answer → verify 失败 → step=2=max_steps → 中断 gateway = make_mock_gateway( [ make_response("ans1"), make_response("ans2"), ] ) engine = ReActEngine( llm_gateway=gateway, max_steps=2, verification_enabled=True, verification_commands=["false"], max_reinjections=5, # 远大于 max_steps,验证 max_steps 优先 ) with patch( "agentkit.core.verification_loop.VerificationLoop", return_value=make_mock_vloop( [ make_verify_result(passed=False), make_verify_result(passed=False), ] ), ): result = await engine.execute( messages=[{"role": "user", "content": "do something"}], ) # LLM 被调用 2 次(受 max_steps=2 限制) assert gateway.chat.await_count == 2 # 第二次 verify 失败,因 max_reinjections=5 未到但 max_steps 到了 # → 应中断(verify_failed 或 partial) assert result.status in ("verify_failed", "partial") # ── execute_stream 回灌 ────────── class TestVerifyReinjectionStream: """execute_stream 模式下的回灌行为。""" async def test_stream_reinjection_first_fail_second_pass(self): """R1 stream: verify 首次失败 → 回灌 → 二次通过。""" from agentkit.llm.protocol import StreamChunk def make_stream_chunks(content: str): """返回一个返回 chunk 列表的 async generator factory。""" async def _stream(**kwargs): # Simulate streaming: yield content in 2 chunks mid = len(content) // 2 yield StreamChunk(content=content[:mid], model="test-model") yield StreamChunk(content=content[mid:], model="test-model") return _stream gateway = MagicMock(spec=LLMGateway) gateway.chat_stream = MagicMock( side_effect=[ make_stream_chunks("bad code")(), make_stream_chunks("fixed code")(), ] ) gateway.get_provider_name_for_model = MagicMock(return_value=None) engine = ReActEngine( llm_gateway=gateway, max_steps=5, verification_enabled=True, verification_commands=["pytest"], max_reinjections=1, ) with patch( "agentkit.core.verification_loop.VerificationLoop", return_value=make_mock_vloop( [ make_verify_result(passed=False), make_verify_result(passed=True), ] ), ): events = [] async for event in engine.execute_stream( messages=[{"role": "user", "content": "write code"}], ): events.append(event) # chat_stream 被调用两次 assert gateway.chat_stream.call_count == 2 # 有 final_answer 事件 final_events = [e for e in events if e.event_type == "final_answer"] assert len(final_events) >= 1 # 最终输出是修正后的答案 assert "fixed code" in final_events[-1].data.get("output", "")