# Source listing metadata: 763 lines, 28 KiB, Python
"""Reflexion Engine 单元测试"""
|
||
|
||
import asyncio
|
||
import json
|
||
from unittest.mock import AsyncMock, MagicMock, patch
|
||
|
||
import pytest
|
||
|
||
from agentkit.core.exceptions import TaskCancelledError, TaskTimeoutError
|
||
from agentkit.core.protocol import CancellationToken
|
||
from agentkit.core.react import ReActEngine, ReActResult, ReActStep
|
||
from agentkit.core.reflexion import ReflexionEngine, ReflexionReflection, ReflexionResult
|
||
from agentkit.llm.gateway import LLMGateway
|
||
from agentkit.llm.protocol import LLMResponse, TokenUsage, ToolCall
|
||
from agentkit.tools.base import Tool
|
||
|
||
|
||
# ── Test Helpers ──────────────────────────────────────────
|
||
|
||
|
||
class FakeTool(Tool):
    """Stub ``Tool`` for tests: returns a canned result or fails on demand.

    Args:
        name: Tool name forwarded to ``Tool.__init__``.
        description: Tool description forwarded to ``Tool.__init__``.
        result: Payload returned by ``execute``. Falls back to
            ``{"status": "ok"}`` only when omitted (``None``); an explicitly
            supplied empty dict is returned unchanged.
        should_fail: When true, ``execute`` raises ``RuntimeError``.
    """

    def __init__(
        self,
        name: str = "fake_tool",
        description: str = "A fake tool for testing",
        result: dict | None = None,
        should_fail: bool = False,
    ):
        super().__init__(name=name, description=description)
        # Test for None instead of truthiness so a caller can deliberately
        # configure an empty (falsy) result without it being replaced.
        self._result = {"status": "ok"} if result is None else result
        self._should_fail = should_fail

    async def execute(self, **kwargs) -> dict:
        """Return the configured result, ignoring any keyword arguments.

        Raises:
            RuntimeError: If the tool was constructed with ``should_fail=True``.
        """
        if self._should_fail:
            raise RuntimeError(f"Tool '{self.name}' execution failed")
        return self._result
|
||
|
||
|
||
def make_mock_gateway(responses: list[LLMResponse]) -> MagicMock:
    """Build a mock ``LLMGateway`` whose ``chat`` replays ``responses`` in order."""
    scripted_chat = AsyncMock(side_effect=responses)
    mock_gateway = MagicMock(spec=LLMGateway)
    mock_gateway.chat = scripted_chat
    return mock_gateway
|
||
|
||
|
||
def make_response(
    content: str = "",
    tool_calls: list[ToolCall] | None = None,
    prompt_tokens: int = 10,
    completion_tokens: int = 20,
) -> LLMResponse:
    """Shortcut for building an ``LLMResponse`` attributed to ``"test-model"``.

    A missing (or empty) ``tool_calls`` argument becomes a fresh empty list.
    """
    token_usage = TokenUsage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
    )
    calls = tool_calls if tool_calls else []
    return LLMResponse(
        content=content,
        model="test-model",
        usage=token_usage,
        tool_calls=calls,
    )
|
||
|
||
|
||
def make_react_result(
    output: str = "test output",
    total_steps: int = 1,
    total_tokens: int = 30,
    status: str = "success",
) -> ReActResult:
    """Shortcut for building a ``ReActResult`` with a single final-answer step.

    The trajectory always holds exactly one ``ReActStep`` (step 1), whatever
    value is passed for ``total_steps``.
    """
    final_step = ReActStep(
        step=1,
        action="final_answer",
        content=output,
        tokens=total_tokens,
    )
    return ReActResult(
        output=output,
        trajectory=[final_step],
        total_steps=total_steps,
        total_tokens=total_tokens,
        status=status,
    )
|
||
|
||
|
||
# ── Test Classes ──────────────────────────────────────────
|
||
|
||
|
||
class TestReflexionFirstExecutionPasses:
    """The first attempt already meets the quality threshold, so nothing is retried."""

    async def test_no_retry_when_score_above_threshold(self):
        # Scripted LLM traffic: one ReAct reply, then one evaluation reply.
        react_reply = make_response(content="The answer is 42")
        evaluation_reply = make_response(
            content='```json\n{"score": 0.9, "reasoning": "Excellent"}\n```'
        )
        gateway = make_mock_gateway([react_reply, evaluation_reply])
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7)

        user_messages = [{"role": "user", "content": "What is the answer?"}]
        result = await engine.execute(messages=user_messages)

        assert isinstance(result, ReflexionResult)
        assert result.output == "The answer is 42"
        assert result.evaluation_score == 0.9
        assert result.reflection_count == 0
        assert len(result.reflections) == 0
        assert result.status == "success"

    async def test_score_exactly_at_threshold(self):
        # A score equal to the threshold must count as a pass.
        scripted = [
            make_response(content="Answer"),
            make_response(content='```json\n{"score": 0.7, "reasoning": "OK"}\n```'),
        ]
        engine = ReflexionEngine(
            llm_gateway=make_mock_gateway(scripted),
            quality_threshold=0.7,
        )

        result = await engine.execute(messages=[{"role": "user", "content": "Task"}])

        assert result.evaluation_score == 0.7
        assert result.reflection_count == 0
|
||
|
||
|
||
class TestReflexionLowScoreTriggersReflection:
    """A score below the threshold triggers a reflection and a retry."""

    async def test_reflection_and_retry_on_low_score(self):
        first_attempt = [
            # 1st ReAct call
            make_response(content="Initial poor answer"),
            # 1st evaluation call: low score
            make_response(content='```json\n{"score": 0.3, "reasoning": "Incomplete"}\n```'),
            # 1st reflection call
            make_response(content="You need to be more specific and provide detailed analysis."),
        ]
        second_attempt = [
            # 2nd ReAct call
            make_response(content="Improved detailed answer"),
            # 2nd evaluation call: high score
            make_response(content='```json\n{"score": 0.85, "reasoning": "Good improvement"}\n```'),
        ]
        gateway = make_mock_gateway(first_attempt + second_attempt)
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7)

        result = await engine.execute(
            messages=[{"role": "user", "content": "Analyze this"}],
        )

        assert result.output == "Improved detailed answer"
        assert result.evaluation_score == 0.85
        assert result.reflection_count == 1
        assert len(result.reflections) == 1

        recorded = result.reflections[0]
        assert recorded.score_before == 0.3
        assert recorded.retry_number == 1
        lowered_text = recorded.reflection_text.lower()
        assert any(keyword in lowered_text for keyword in ("specific", "detailed"))
|
||
|
||
|
||
class TestReflexionRetryImprovesScore:
    """Scores improve across retries and the final attempt is returned."""

    async def test_multiple_retries_improve_score(self):
        # Each attempt is: ReAct reply, evaluation reply, then (if the score
        # is too low) a reflection reply.
        script = [
            # Attempt 1
            "Bad answer",
            '```json\n{"score": 0.2}\n```',
            "Need more depth",
            # Attempt 2
            "Better answer",
            '```json\n{"score": 0.5}\n```',
            "Still needs improvement",
            # Attempt 3
            "Great answer",
            '```json\n{"score": 0.9}\n```',
        ]
        gateway = make_mock_gateway([make_response(content=text) for text in script])
        engine = ReflexionEngine(
            llm_gateway=gateway,
            quality_threshold=0.7,
            max_reflections=3,
        )

        result = await engine.execute(
            messages=[{"role": "user", "content": "Complex task"}],
        )

        assert result.output == "Great answer"
        assert result.evaluation_score == 0.9
        assert result.reflection_count == 2
        assert len(result.reflections) == 2
        first_reflection, second_reflection = result.reflections[0], result.reflections[1]
        assert first_reflection.retry_number == 1
        assert second_reflection.retry_number == 2
|
||
|
||
|
||
class TestReflexionMaxReflectionsReached:
    """Once the reflection budget is used up, the best result is returned."""

    async def test_returns_best_result_when_max_reflections_reached(self):
        script = [
            # Attempt 1
            "Poor answer",
            '```json\n{"score": 0.3}\n```',
            "Try harder",
            # Attempt 2
            "Slightly better answer",
            '```json\n{"score": 0.5}\n```',
            "Still not good enough",
            # Attempt 3 (max)
            "Another answer",
            '```json\n{"score": 0.6}\n```',
        ]
        responses = [make_response(content=text) for text in script]
        engine = ReflexionEngine(
            llm_gateway=make_mock_gateway(responses),
            quality_threshold=0.7,
            max_reflections=3,
        )

        result = await engine.execute(
            messages=[{"role": "user", "content": "Hard task"}],
        )

        # No attempt reaches 0.7; the best one is the last attempt (0.6).
        assert result.evaluation_score == 0.6
        assert result.reflection_count == 2
        assert result.output == "Another answer"
|
||
|
||
|
||
class TestReflexionEvaluationFailure:
    """A failing evaluation LLM call falls back to a neutral score."""

    async def test_evaluation_failure_falls_back_to_neutral_score(self):
        """A failed evaluation yields the neutral 0.5, which is below the
        threshold and therefore triggers a reflection and a retry."""
        calls_made = 0

        async def chat_side_effect(**kwargs):
            nonlocal calls_made
            calls_made += 1
            if calls_made == 2:
                # Evaluation call: fails
                raise RuntimeError("LLM unavailable")
            scripted_text = {
                1: "Some answer",  # ReAct call
                3: "Try to be more detailed",  # Reflection call (0.5 < 0.7)
                4: "Better answer",  # 2nd ReAct call
            }
            # Anything from the 5th call on is a successful evaluation.
            fallback = '```json\n{"score": 0.9}\n```'
            return make_response(content=scripted_text.get(calls_made, fallback))

        gateway = MagicMock(spec=LLMGateway)
        gateway.chat = AsyncMock(side_effect=chat_side_effect)
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7)

        result = await engine.execute(messages=[{"role": "user", "content": "Task"}])

        # The evaluation failure is absorbed; neutral 0.5 < 0.7 forces a
        # reflection plus retry, whose evaluation then succeeds.
        assert isinstance(result, ReflexionResult)
        assert result.output == "Better answer"
        assert result.evaluation_score == 0.9
        assert result.reflection_count == 1

    async def test_evaluation_failure_returns_neutral_score(self):
        """Check that a failed evaluation really leads to a reflection."""
        calls_made = 0

        async def chat_side_effect(**kwargs):
            nonlocal calls_made
            calls_made += 1
            if calls_made == 2:
                raise RuntimeError("Evaluation failed")
            scripted_text = {
                1: "Answer",
                3: "Reflection text",
                4: "Better answer",
            }
            fallback = '```json\n{"score": 0.9}\n```'
            return make_response(content=scripted_text.get(calls_made, fallback))

        gateway = MagicMock(spec=LLMGateway)
        gateway.chat = AsyncMock(side_effect=chat_side_effect)
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7)

        result = await engine.execute(messages=[{"role": "user", "content": "Task"}])

        # A reflection (0.5 < 0.7) and a retry should have happened.
        assert result.reflection_count >= 1
|
||
|
||
|
||
class TestReflexionReflectionFailure:
    """A failing reflection LLM call makes the engine return the current result."""

    async def test_reflection_failure_returns_current_result(self):
        calls_made = 0

        async def chat_side_effect(**kwargs):
            nonlocal calls_made
            calls_made += 1
            if calls_made == 3:
                # Reflection call: fails
                raise RuntimeError("Reflection LLM unavailable")
            scripted_text = {
                1: "Initial answer",  # ReAct call
                2: '```json\n{"score": 0.3}\n```',  # Evaluation call: low score
            }
            return make_response(
                content=scripted_text.get(calls_made, "Should not reach here")
            )

        gateway = MagicMock(spec=LLMGateway)
        gateway.chat = AsyncMock(side_effect=chat_side_effect)
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7)

        result = await engine.execute(messages=[{"role": "user", "content": "Task"}])

        # The engine must hand back the first attempt instead of crashing.
        assert isinstance(result, ReflexionResult)
        assert result.output == "Initial answer"
        assert result.evaluation_score == 0.3
        # The failed reflection is not recorded.
        assert result.reflection_count == 0
|
||
|
||
|
||
class TestReflexionCancellationToken:
    """Cancellation-token handling."""

    async def test_cancelled_before_execution(self):
        """A token cancelled up front makes ``execute`` raise ``TaskCancelledError``."""
        gateway = make_mock_gateway([
            make_response(content="Answer"),
        ])
        engine = ReflexionEngine(llm_gateway=gateway)

        token = CancellationToken()
        token.cancel()

        with pytest.raises(TaskCancelledError):
            await engine.execute(
                messages=[{"role": "user", "content": "Task"}],
                cancellation_token=token,
            )

    async def test_cancelled_mid_execution(self):
        """Exercise the cancellation check with a gateway that never runs dry.

        NOTE(review): despite its name, this test cancels the token *before*
        calling ``execute``, so it only covers the check made at the start of
        the engine loop. A genuine mid-run cancellation (cancelling from
        inside the chat side effect) is not covered here.
        """

        async def chat_side_effect(**kwargs):
            # Every call gets a fresh, identical response; the previous
            # call counter and its no-op branch were dead code.
            return make_response(content="Answer")

        gateway = MagicMock(spec=LLMGateway)
        gateway.chat = AsyncMock(side_effect=chat_side_effect)
        engine = ReflexionEngine(llm_gateway=gateway)

        token = CancellationToken()
        # Pre-cancel to test the check at the beginning of the loop.
        token.cancel()

        with pytest.raises(TaskCancelledError):
            await engine.execute(
                messages=[{"role": "user", "content": "Task"}],
                cancellation_token=token,
            )

    async def test_uncancelled_token_works_normally(self):
        """A token that is never cancelled does not disturb a normal run."""
        gateway = make_mock_gateway([
            make_response(content="Answer"),
            make_response(content='```json\n{"score": 0.9}\n```'),
        ])
        engine = ReflexionEngine(llm_gateway=gateway)

        token = CancellationToken()
        result = await engine.execute(
            messages=[{"role": "user", "content": "Task"}],
            cancellation_token=token,
        )

        assert result.output == "Answer"
        assert result.evaluation_score == 0.9
|
||
|
||
|
||
class TestReflexionInterfaceCompatibility:
    """Interface compatibility with ``ReActEngine``."""

    async def test_same_parameter_signature_as_react(self):
        """``ReflexionEngine.execute()`` accepts the same parameters as ``ReActEngine``."""
        responses = [
            make_response(content="Answer"),
            make_response(content='```json\n{"score": 0.8}\n```'),
        ]
        engine = ReflexionEngine(llm_gateway=make_mock_gateway(responses))

        # Every keyword listed here must be accepted by execute().
        react_style_kwargs = {
            "messages": [{"role": "user", "content": "Task"}],
            "tools": None,
            "model": "gpt-4",
            "agent_name": "test_agent",
            "task_type": "analysis",
            "system_prompt": "You are helpful",
            "trace_recorder": None,
            "memory_retriever": None,
            "task_id": "task-123",
            "compressor": None,
            "retrieval_config": None,
            "cancellation_token": None,
            "timeout_seconds": 300,
        }
        result = await engine.execute(**react_style_kwargs)

        assert isinstance(result, ReflexionResult)

    async def test_reflexion_result_has_react_result_fields(self):
        """``ReflexionResult`` exposes every ``ReActResult`` field plus its own."""
        responses = [
            make_response(content="Answer"),
            make_response(content='```json\n{"score": 0.85}\n```'),
        ]
        engine = ReflexionEngine(llm_gateway=make_mock_gateway(responses))

        result = await engine.execute(messages=[{"role": "user", "content": "Task"}])

        # ReActResult fields
        for react_field in ("output", "trajectory", "total_steps", "total_tokens", "status"):
            assert hasattr(result, react_field)

        # ReflexionResult additional fields
        for reflexion_field in ("evaluation_score", "reflection_count", "reflections"):
            assert hasattr(result, reflexion_field)

    async def test_reflexion_composes_react_engine(self):
        """``ReflexionEngine`` composes a ``ReActEngine`` rather than inheriting from it."""
        responses = [
            make_response(content="Answer"),
            make_response(content='```json\n{"score": 0.9}\n```'),
        ]
        engine = ReflexionEngine(llm_gateway=make_mock_gateway(responses))

        # Composition: the inner engine lives on a private attribute.
        assert hasattr(engine, "_react_engine")
        inner_engine = engine._react_engine
        assert isinstance(inner_engine, ReActEngine)
        # The outer engine itself is not a ReActEngine subclass instance.
        assert not isinstance(engine, ReActEngine)

    async def test_reflexion_result_trajectory_uses_react_step(self):
        """``ReflexionResult.trajectory`` is made of ``ReActStep`` items."""
        responses = [
            make_response(content="Answer"),
            make_response(content='```json\n{"score": 0.9}\n```'),
        ]
        engine = ReflexionEngine(llm_gateway=make_mock_gateway(responses))

        result = await engine.execute(messages=[{"role": "user", "content": "Task"}])

        assert all(isinstance(step, ReActStep) for step in result.trajectory)
|
||
|
||
|
||
class TestReflexionLayeredModels:
    """Per-phase model selection (act / evaluate / reflect)."""

    async def test_default_models_same_as_input(self):
        """Without overrides, the evaluation call uses the model passed to ``execute``."""
        gateway = make_mock_gateway([
            make_response(content="Answer"),
            make_response(content='```json\n{"score": 0.9}\n```'),
        ])
        engine = ReflexionEngine(llm_gateway=gateway)

        await engine.execute(
            messages=[{"role": "user", "content": "Task"}],
            model="gpt-4",
        )

        # The 2nd gateway call is the evaluation call.
        evaluation_call = gateway.chat.call_args_list[1]
        assert evaluation_call.kwargs.get("model") == "gpt-4"

    async def test_separate_evaluate_model(self):
        """A dedicated ``evaluate_model`` is used for the evaluation call."""
        gateway = make_mock_gateway([
            make_response(content="Answer"),
            make_response(content='```json\n{"score": 0.9}\n```'),
        ])
        engine = ReflexionEngine(llm_gateway=gateway)

        await engine.execute(
            messages=[{"role": "user", "content": "Task"}],
            model="gpt-3.5",
            evaluate_model="gpt-4",
        )

        # The evaluation call must carry gpt-4, not the acting model.
        evaluation_call = gateway.chat.call_args_list[1]
        assert evaluation_call.kwargs.get("model") == "gpt-4"

    async def test_separate_reflect_model(self):
        """A dedicated ``reflect_model`` is used for the reflection call."""
        script = [
            "Poor answer",
            '```json\n{"score": 0.3}\n```',
            "Reflection text",
            "Better answer",
            '```json\n{"score": 0.9}\n```',
        ]
        gateway = make_mock_gateway([make_response(content=text) for text in script])
        engine = ReflexionEngine(llm_gateway=gateway)

        await engine.execute(
            messages=[{"role": "user", "content": "Task"}],
            model="gpt-3.5",
            evaluate_model="gpt-4",
            reflect_model="claude-3",
        )

        # The reflection call is the 3rd gateway call.
        reflection_call = gateway.chat.call_args_list[2]
        assert reflection_call.kwargs.get("model") == "claude-3"
|
||
|
||
|
||
class TestReflexionConstructorValidation:
    """Validation of constructor arguments."""

    def test_invalid_max_steps(self):
        mock_gateway = MagicMock(spec=LLMGateway)
        with pytest.raises(ValueError, match="max_steps"):
            ReflexionEngine(llm_gateway=mock_gateway, max_steps=0)

    def test_invalid_max_reflections(self):
        mock_gateway = MagicMock(spec=LLMGateway)
        with pytest.raises(ValueError, match="max_reflections"):
            ReflexionEngine(llm_gateway=mock_gateway, max_reflections=0)

    def test_invalid_quality_threshold(self):
        mock_gateway = MagicMock(spec=LLMGateway)
        with pytest.raises(ValueError, match="quality_threshold"):
            ReflexionEngine(llm_gateway=mock_gateway, quality_threshold=1.5)

    def test_valid_construction(self):
        # Valid settings are stored on the matching private attributes.
        settings = {
            "max_steps": 5,
            "max_reflections": 2,
            "quality_threshold": 0.8,
            "default_timeout": 60.0,
        }
        engine = ReflexionEngine(llm_gateway=MagicMock(spec=LLMGateway), **settings)

        assert engine._max_steps == 5
        assert engine._max_reflections == 2
        assert engine._quality_threshold == 0.8
        assert engine._default_timeout == 60.0
|
||
|
||
|
||
class TestReflexionTimeout:
    """Timeout handling."""

    async def test_timeout_raises_task_timeout_error(self):
        async def slow_chat(**kwargs):
            # Sleeps longer (0.5 s) than the 0.3 s budget given to execute().
            await asyncio.sleep(0.5)
            return make_response(content="slow")

        slow_gateway = MagicMock(spec=LLMGateway)
        slow_gateway.chat = AsyncMock(side_effect=slow_chat)
        engine = ReflexionEngine(llm_gateway=slow_gateway)

        user_messages = [{"role": "user", "content": "Task"}]
        with pytest.raises(TaskTimeoutError):
            await engine.execute(messages=user_messages, timeout_seconds=0.3)
|
||
|
||
|
||
class TestReflexionEvaluationParsing:
    """Parsing of the evaluation score from different reply formats."""

    @staticmethod
    async def _run_with_evaluation(evaluation_content: str):
        """Run one attempt whose evaluation reply is ``evaluation_content``."""
        gateway = make_mock_gateway([
            make_response(content="Answer"),
            make_response(content=evaluation_content),
        ])
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7)
        return await engine.execute(
            messages=[{"role": "user", "content": "Task"}],
        )

    async def test_parse_score_from_json_code_block(self):
        result = await self._run_with_evaluation(
            '```json\n{"score": 0.85, "reasoning": "Good"}\n```'
        )

        assert result.evaluation_score == 0.85

    async def test_parse_score_from_plain_json(self):
        result = await self._run_with_evaluation('{"score": 0.75, "reasoning": "OK"}')

        assert result.evaluation_score == 0.75

    async def test_parse_score_from_text(self):
        result = await self._run_with_evaluation(
            'The score is 0.8 based on my evaluation.'
        )

        assert result.evaluation_score == 0.8

    async def test_score_clamped_to_range(self):
        result = await self._run_with_evaluation('```json\n{"score": 1.5}\n```')

        # An out-of-range 1.5 is expected to be clamped down to 1.0.
        assert result.evaluation_score == 1.0
|
||
|
||
|
||
class TestReflexionReflectionPrompt:
    """Construction of the reflection prompt."""

    async def test_reflection_injected_into_system_prompt(self):
        """Intended to verify that reflection text reaches the next ReAct system prompt.

        NOTE(review): the prompt sent on the 4th gateway call (2nd ReAct call)
        is not inspected; only the outcome of the retry is asserted.
        """
        script = [
            "Poor answer",
            '```json\n{"score": 0.3}\n```',
            "You need to provide more specific details.",
            "Better answer with details",
            '```json\n{"score": 0.9}\n```',
        ]
        gateway = make_mock_gateway([make_response(content=text) for text in script])
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7)

        result = await engine.execute(
            messages=[{"role": "user", "content": "Task"}],
            system_prompt="You are a helpful assistant",
        )

        assert result.reflection_count == 1
        assert result.evaluation_score == 0.9
|
||
|
||
|
||
class TestReflexionStreaming:
    """Streaming execution."""

    async def test_execute_stream_yields_events(self):
        """``execute_stream`` emits the expected event types on a passing run."""
        from agentkit.core.react import ReActEvent

        # Stand-in for ReActEngine.execute_stream.
        async def mock_react_stream(**kwargs):
            yield ReActEvent(event_type="thinking", step=1, data={"message": "Thinking..."})
            yield ReActEvent(
                event_type="final_answer",
                step=1,
                data={"output": "Answer", "total_steps": 1, "total_tokens": 30},
            )

        # The gateway only serves the evaluation call here.
        gateway = MagicMock(spec=LLMGateway)
        gateway.chat = AsyncMock(side_effect=[
            make_response(content='```json\n{"score": 0.9}\n```'),
        ])
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7)

        with patch.object(engine._react_engine, "execute_stream", side_effect=mock_react_stream):
            events = [
                event
                async for event in engine.execute_stream(
                    messages=[{"role": "user", "content": "Task"}],
                )
            ]

        event_types = [event.event_type for event in events]
        for expected_type in ("executing", "evaluating", "evaluation_result", "final_answer"):
            assert expected_type in event_types

    async def test_execute_stream_reflection_events(self):
        """``execute_stream`` emits reflection and retry events on a low score."""
        from agentkit.core.react import ReActEvent

        stream_runs = []

        async def mock_react_stream(**kwargs):
            # First run produces the poor answer; every later run the good one.
            stream_runs.append(kwargs)
            answer = "Poor answer" if len(stream_runs) == 1 else "Good answer"
            yield ReActEvent(
                event_type="final_answer",
                step=1,
                data={"output": answer, "total_steps": 1, "total_tokens": 30},
            )

        # Evaluation is low first, then high, with one reflection in between.
        gateway = MagicMock(spec=LLMGateway)
        gateway.chat = AsyncMock(side_effect=[
            make_response(content='```json\n{"score": 0.3}\n```'),  # 1st eval
            make_response(content="Need improvement"),  # reflection
            make_response(content='```json\n{"score": 0.9}\n```'),  # 2nd eval
        ])
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7)

        with patch.object(engine._react_engine, "execute_stream", side_effect=mock_react_stream):
            events = [
                event
                async for event in engine.execute_stream(
                    messages=[{"role": "user", "content": "Task"}],
                )
            ]

        event_types = [event.event_type for event in events]
        expected_types = (
            "executing",
            "evaluating",
            "evaluation_result",
            "reflecting",
            "reflection_result",
            "retrying",
            "final_answer",
        )
        for expected_type in expected_types:
            assert expected_type in event_types
|
||
|
||
|
||
class TestReflexionBestResultTracking:
    """Tracking of the best result across attempts."""

    async def test_returns_best_result_across_attempts(self):
        """When later attempts score lower, the earlier best result is returned."""
        script = [
            # Attempt 1: score 0.5
            "Decent answer",
            '```json\n{"score": 0.5}\n```',
            "Try to improve",
            # Attempt 2: score 0.4 (worse)
            "Worse answer",
            '```json\n{"score": 0.4}\n```',
            "Still trying",
            # Attempt 3: score 0.45 (still worse than attempt 1)
            "Another answer",
            '```json\n{"score": 0.45}\n```',
            # Spare reflection reply for attempt 3, in case the engine asks for one
            "Final reflection",
        ]
        responses = [make_response(content=text) for text in script]
        engine = ReflexionEngine(
            llm_gateway=make_mock_gateway(responses),
            quality_threshold=0.7,
            max_reflections=3,
        )

        result = await engine.execute(messages=[{"role": "user", "content": "Task"}])

        # Attempt 1 holds the best score (0.5), so its output wins.
        assert result.evaluation_score == 0.5
        assert result.output == "Decent answer"
|