395 lines
14 KiB
Python
395 lines
14 KiB
Python
"""U4/G1: Verify 失败回灌 ReAct 测试
|
|
|
|
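characterization-first: first cover the existing verify behavior
(max_reinjections=0 is equivalent to no reinjection), then test the new
reinjection behavior (reinjection on first fail, break on second fail).

R1: first verify failure -> errors injected into the conversation -> LLM self-corrects -> second verify passes
R2: second verify failure -> abort and return an error with the verify log attached
R3: max_reinjections is configurable (default 1); =0 is equivalent to no reinjection; reinjection is bounded by max_steps
R13: ServerConfig.verification configuration option
R14: max_reinjections defaults to 1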
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
|
|
from agentkit.core.react import ReActEngine
|
|
from agentkit.core.verification_loop import VerificationResult
|
|
from agentkit.llm.gateway import LLMGateway
|
|
from agentkit.llm.protocol import LLMResponse, TokenUsage
|
|
|
|
|
|
# ── Helpers ──────────────────────────────────────────────
|
|
|
|
|
|
def make_mock_gateway(responses: list[LLMResponse]) -> LLMGateway:
    """Build a mock ``LLMGateway`` whose ``chat`` hands out ``responses`` in order.

    Each awaited ``chat`` call returns the next item of ``responses``;
    ``get_provider_name_for_model`` always returns ``None``.
    """
    mock_gateway = MagicMock(spec=LLMGateway)
    mock_gateway.configure_mock(
        chat=AsyncMock(side_effect=responses),
        get_provider_name_for_model=MagicMock(return_value=None),
    )
    return mock_gateway
|
|
|
|
|
|
def make_response(content: str = "") -> LLMResponse:
    """Return an ``LLMResponse`` carrying ``content`` and an empty tool-call list.

    The model name and token usage are fixed test values.
    """
    token_usage = TokenUsage(prompt_tokens=10, completion_tokens=20)
    return LLMResponse(
        content=content,
        model="test-model",
        usage=token_usage,
        tool_calls=[],
    )
|
|
|
|
|
|
def make_verify_result(passed: bool, errors: list[str] | None = None) -> VerificationResult:
    """Build a single-attempt ``VerificationResult`` for tests.

    Args:
        passed: Whether the simulated verification succeeded.
        errors: Error entries to attach. When omitted (``None``), a passing
            result gets an empty list and a failing result gets one
            placeholder entry. An explicitly supplied list, including an
            empty one, is used as given.

    Returns:
        A ``VerificationResult`` with ``attempts=1`` and a canned
        ``test_output`` matching ``passed``.
    """
    # Compare against None rather than relying on truthiness, so that an
    # explicit empty list is not silently replaced by the placeholder.
    if errors is None:
        errors = [] if passed else ["test_x.py::test_failed"]
    test_output = "$ pytest\nOK" if passed else "$ pytest\nFAILED test_x.py"
    return VerificationResult(
        passed=passed,
        attempts=1,
        test_output=test_output,
        errors=errors,
    )
|
|
|
|
|
|
def make_mock_vloop(verify_results: list[VerificationResult]) -> MagicMock:
    """Build a stand-in VerificationLoop whose awaited ``verify()`` returns ``verify_results`` in order."""
    return MagicMock(verify=AsyncMock(side_effect=verify_results))
|
|
|
|
|
|
# ── Characterization: max_reinjections=0 等价于当前行为 ──────────
|
|
|
|
|
|
class TestVerifyCharacterization:
    """Pin the existing verify behavior (max_reinjections=0): a failure is only recorded in the trajectory, never reinjected."""

    async def test_verify_disabled_no_verify_step(self):
        """Engine built without verification options -> no verification step in the trajectory."""
        llm = make_mock_gateway([make_response("final answer")])
        react = ReActEngine(llm_gateway=llm, max_steps=3)

        outcome = await react.execute(
            messages=[{"role": "user", "content": "do something"}],
        )

        assert outcome.output == "final answer"
        assert not any(step.tool_name == "verification" for step in outcome.trajectory)

    async def test_verify_pass_no_extra_step(self):
        """A passing verify appends no verification step."""
        llm = make_mock_gateway([make_response("answer")])
        react = ReActEngine(
            llm_gateway=llm,
            max_steps=3,
            verification_enabled=True,
            verification_commands=["echo ok"],
            max_reinjections=0,
        )
        passing_loop = make_mock_vloop([make_verify_result(passed=True)])

        with patch("agentkit.core.verification_loop.VerificationLoop", return_value=passing_loop):
            outcome = await react.execute(
                messages=[{"role": "user", "content": "do something"}],
            )

        assert outcome.output == "answer"
        assert not any(step.tool_name == "verification" for step in outcome.trajectory)

    async def test_verify_fail_max_zero_no_reinjection(self):
        """max_reinjections=0 and a failing verify -> recorded in the trajectory only, nothing fed back to the LLM.

        Characterization of the current behavior: ``gateway.chat`` is awaited exactly once.
        """
        llm = make_mock_gateway([make_response("bad answer")])
        react = ReActEngine(
            llm_gateway=llm,
            max_steps=3,
            verification_enabled=True,
            verification_commands=["false"],
            max_reinjections=0,
        )
        failing_loop = make_mock_vloop([make_verify_result(passed=False)])

        with patch("agentkit.core.verification_loop.VerificationLoop", return_value=failing_loop):
            outcome = await react.execute(
                messages=[{"role": "user", "content": "do something"}],
            )

        # The LLM is called only once (no reinjection).
        assert llm.chat.await_count == 1
        # The output is still kept.
        assert outcome.output == "bad answer"
        # The trajectory contains the verification step.
        recorded = [step for step in outcome.trajectory if step.tool_name == "verification"]
        assert len(recorded) == 1
        assert recorded[0].result["passed"] is False
|
|
|
|
|
|
# ── R1: 回灌后 LLM 自纠正 → 二次 verify 通过 ──────────
|
|
|
|
|
|
class TestVerifyReinjection:
    """A failed verify is fed back into the conversation; the LLM self-corrects and the second verify passes."""

    async def test_first_fail_reinject_second_pass(self):
        """R1: first verify fails -> errors injected into the conversation -> LLM corrects -> second verify passes."""
        gateway = make_mock_gateway(
            [
                make_response("bad code"),  # first call: wrong answer
                make_response("fixed code"),  # second call: corrected answer
            ]
        )
        engine = ReActEngine(
            llm_gateway=gateway,
            max_steps=5,
            verification_enabled=True,
            verification_commands=["pytest"],
            max_reinjections=1,  # the default value; allows one reinjection
        )

        with patch(
            "agentkit.core.verification_loop.VerificationLoop",
            return_value=make_mock_vloop(
                [
                    make_verify_result(passed=False),  # first verify fails
                    make_verify_result(passed=True),  # second verify passes
                ]
            ),
        ):
            result = await engine.execute(
                messages=[{"role": "user", "content": "write code"}],
            )

        # The LLM is called twice.
        assert gateway.chat.await_count == 2
        # The final output is the corrected answer.
        assert result.output == "fixed code"
        # The second verify passed, so no verification step is appended.
        verify_steps = [s for s in result.trajectory if s.tool_name == "verification"]
        assert len(verify_steps) == 0

    async def test_reinjected_user_message_appears_in_conversation(self):
        """R1 integration: the reinjected user message shows up in the conversation and carries the error text."""
        gateway = make_mock_gateway(
            [
                make_response("bad"),
                make_response("good"),
            ]
        )
        engine = ReActEngine(
            llm_gateway=gateway,
            max_steps=5,
            verification_enabled=True,
            verification_commands=["pytest"],
            max_reinjections=1,
        )

        with patch(
            "agentkit.core.verification_loop.VerificationLoop",
            return_value=make_mock_vloop(
                [
                    make_verify_result(passed=False, errors=["AssertionError: x != y"]),
                    make_verify_result(passed=True),
                ]
            ),
        ):
            await engine.execute(
                messages=[{"role": "user", "content": "write code"}],
            )

        # On the second LLM call the conversation should contain the reinjected user message.
        second_call = gateway.chat.await_args_list[1]
        msgs_sent = second_call.kwargs.get("messages")
        if msgs_sent is None:
            # ``call[1]`` is the same dict as ``call.kwargs``, so it cannot serve
            # as a fallback. If the engine passes the conversation positionally,
            # take the first list argument instead.
            # NOTE(review): assumes the conversation is the only list-typed
            # positional argument of ``chat`` -- confirm against LLMGateway.chat.
            msgs_sent = next((arg for arg in second_call.args if isinstance(arg, list)), None)
        assert msgs_sent is not None, "second chat call carried no messages argument"

        reinjected = [
            m
            for m in msgs_sent
            # ``or ""`` guards against a user message whose content is None.
            if m.get("role") == "user" and "验证失败" in (m.get("content") or "")
        ]
        assert len(reinjected) >= 1
        assert "AssertionError: x != y" in reinjected[-1]["content"]
|
|
|
|
|
|
# ── R2: 二次 verify 失败 → 中断返回错误 ──────────
|
|
|
|
|
|
class TestVerifyDoubleFailure:
    """A second verify failure aborts the run and returns an error with the verify log attached."""

    async def test_second_fail_breaks_with_verify_log(self):
        """R2: second verify failure -> abort; the trajectory holds the verify log and its errors."""
        llm = make_mock_gateway([make_response("bad v1"), make_response("bad v2")])
        react = ReActEngine(
            llm_gateway=llm,
            max_steps=5,
            verification_enabled=True,
            verification_commands=["pytest"],
            max_reinjections=1,
        )
        always_failing = make_mock_vloop(
            [
                make_verify_result(passed=False, errors=["err1"]),
                make_verify_result(passed=False, errors=["err2"]),
            ]
        )

        with patch("agentkit.core.verification_loop.VerificationLoop", return_value=always_failing):
            outcome = await react.execute(
                messages=[{"role": "user", "content": "write code"}],
            )

        # The LLM is called twice (initial call + 1 reinjection).
        assert llm.chat.await_count == 2
        # The status flags the verify failure.
        assert outcome.status == "verify_failed"
        # The output is kept (the LLM's last answer).
        assert outcome.output == "bad v2"
        # The trajectory holds a verification step carrying the errors.
        recorded = [step for step in outcome.trajectory if step.tool_name == "verification"]
        assert len(recorded) == 1
        last_verify = recorded[0].result
        assert last_verify["passed"] is False
        assert "err2" in last_verify["errors"]
|
|
|
|
|
|
# ── R3: 配置 + 边界 ──────────
|
|
|
|
|
|
class TestVerifyReinjectionConfig:
    """Tests for the ``max_reinjections`` setting."""

    def test_default_max_reinjections_is_one(self):
        """R14 self-check: ``max_reinjections`` defaults to 1."""
        default_engine = ReActEngine(llm_gateway=make_mock_gateway([]))
        assert default_engine._max_reinjections == 1

    async def test_max_reinjections_zero_skips_reinjection(self):
        """R3: max_reinjections=0 -> equivalent to no reinjection (the current behavior)."""
        llm = make_mock_gateway([make_response("only answer")])
        react = ReActEngine(
            llm_gateway=llm,
            max_steps=5,
            verification_enabled=True,
            verification_commands=["false"],
            max_reinjections=0,
        )
        failing_loop = make_mock_vloop([make_verify_result(passed=False)])

        with patch("agentkit.core.verification_loop.VerificationLoop", return_value=failing_loop):
            outcome = await react.execute(
                messages=[{"role": "user", "content": "do something"}],
            )

        assert llm.chat.await_count == 1  # no reinjection
        assert outcome.output == "only answer"

    async def test_reinjection_hits_max_steps_interrupts(self):
        """R3 edge: hitting max_steps during reinjection -> abort (no infinite loop)."""
        # max_steps=2, max_reinjections=5 (max_steps is reached first)
        # LLM call 1: final answer -> verify fails -> reinject
        # LLM call 2: final answer -> verify fails -> step=2=max_steps -> abort
        llm = make_mock_gateway([make_response("ans1"), make_response("ans2")])
        react = ReActEngine(
            llm_gateway=llm,
            max_steps=2,
            verification_enabled=True,
            verification_commands=["false"],
            max_reinjections=5,  # far above max_steps, to check that max_steps wins
        )
        always_failing = make_mock_vloop(
            [
                make_verify_result(passed=False),
                make_verify_result(passed=False),
            ]
        )

        with patch("agentkit.core.verification_loop.VerificationLoop", return_value=always_failing):
            outcome = await react.execute(
                messages=[{"role": "user", "content": "do something"}],
            )

        # The LLM is called 2 times (capped by max_steps=2).
        assert llm.chat.await_count == 2
        # The second verify fails; max_reinjections=5 is not exhausted but max_steps is
        # -> the run should abort (verify_failed or partial).
        assert outcome.status in ("verify_failed", "partial")
|
|
|
|
|
|
# ── execute_stream 回灌 ──────────
|
|
|
|
|
|
class TestVerifyReinjectionStream:
    """Reinjection behavior in ``execute_stream`` mode."""

    async def test_stream_reinjection_first_fail_second_pass(self):
        """R1 stream: first verify fails -> reinjection -> second verify passes."""
        from agentkit.llm.protocol import StreamChunk

        async def stream_reply(text: str):
            """Async generator simulating streaming: emit ``text`` as two chunks split at its midpoint."""
            half = len(text) // 2
            yield StreamChunk(content=text[:half], model="test-model")
            yield StreamChunk(content=text[half:], model="test-model")

        llm = MagicMock(spec=LLMGateway)
        # Each chat_stream call returns the next pre-built async generator.
        llm.chat_stream = MagicMock(
            side_effect=[stream_reply("bad code"), stream_reply("fixed code")]
        )
        llm.get_provider_name_for_model = MagicMock(return_value=None)

        react = ReActEngine(
            llm_gateway=llm,
            max_steps=5,
            verification_enabled=True,
            verification_commands=["pytest"],
            max_reinjections=1,
        )
        fail_then_pass = make_mock_vloop(
            [
                make_verify_result(passed=False),
                make_verify_result(passed=True),
            ]
        )

        with patch("agentkit.core.verification_loop.VerificationLoop", return_value=fail_then_pass):
            events = []
            async for emitted in react.execute_stream(
                messages=[{"role": "user", "content": "write code"}],
            ):
                events.append(emitted)

        # chat_stream is called twice.
        assert llm.chat_stream.call_count == 2
        # There is at least one final_answer event.
        finals = [evt for evt in events if evt.event_type == "final_answer"]
        assert len(finals) >= 1
        # The final output is the corrected answer.
        assert "fixed code" in finals[-1].data.get("output", "")
|