293 lines
11 KiB
Python
293 lines
11 KiB
Python
"""集成测试 - Reflexion 多轮循环
|
||
|
||
测试 ReflexionEngine 的 Evaluate→Reflect→Retry 循环。
|
||
仅 mock LLMGateway(外部 API),使用真实 ReflexionEngine 实例。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from unittest.mock import AsyncMock, MagicMock
|
||
|
||
import pytest
|
||
|
||
from agentkit.core.react import ReActEngine, ReActStep
|
||
from agentkit.core.reflexion import ReflexionEngine, ReflexionReflection, ReflexionResult
|
||
from agentkit.llm.gateway import LLMGateway
|
||
from agentkit.llm.protocol import LLMResponse, TokenUsage
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def make_response(
    content: str = "",
    prompt_tokens: int = 10,
    completion_tokens: int = 20,
) -> LLMResponse:
    """Build a canned ``LLMResponse`` for scripting the mocked gateway.

    Args:
        content: Text placed in the response's ``content`` field.
        prompt_tokens: Prompt token count recorded in the usage block.
        completion_tokens: Completion token count recorded in the usage block.

    Returns:
        An ``LLMResponse`` whose ``model`` is ``"test-model"``.
    """
    token_usage = TokenUsage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
    )
    return LLMResponse(content=content, model="test-model", usage=token_usage)
|
||
|
||
|
||
def make_mock_gateway(responses: list[LLMResponse]) -> MagicMock:
    """Create a mocked ``LLMGateway`` whose ``chat`` replays *responses*.

    Args:
        responses: Values handed out one per ``chat`` call, in list order
            (installed as the ``AsyncMock`` ``side_effect``).

    Returns:
        A ``MagicMock`` built with ``spec=LLMGateway`` whose ``chat``
        attribute is an ``AsyncMock``.
    """
    scripted_chat = AsyncMock(side_effect=responses)
    mock_gateway = MagicMock(spec=LLMGateway)
    mock_gateway.chat = scripted_chat
    return mock_gateway
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test 1: First attempt scores high → no retry, returns result
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestReflexionFirstAttemptPasses:
    """The first attempt meets the quality threshold, so nothing is retried."""

    @pytest.mark.asyncio
    async def test_high_score_no_retry(self):
        """A score above the threshold returns the first answer as-is."""
        gateway = make_mock_gateway([
            # ReAct call
            make_response(content="The answer is 42"),
            # Evaluation call - high score
            make_response(content='```json\n{"score": 0.9, "reasoning": "Excellent"}\n```'),
        ])
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7)

        result = await engine.execute(
            messages=[{"role": "user", "content": "What is the answer?"}],
        )

        assert isinstance(result, ReflexionResult)
        assert result.output == "The answer is 42"
        assert result.evaluation_score == 0.9
        assert result.reflection_count == 0
        assert len(result.reflections) == 0
        assert result.status == "success"
        # Pin the LLM traffic as well as the result fields: one ReAct call
        # plus one evaluation call, i.e. no reflection or retry request.
        assert gateway.chat.await_count == 2

    @pytest.mark.asyncio
    async def test_score_exactly_at_threshold(self):
        """A score exactly equal to the threshold does not trigger a retry."""
        gateway = make_mock_gateway([
            make_response(content="Answer"),
            make_response(content='```json\n{"score": 0.7, "reasoning": "OK"}\n```'),
        ])
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7)

        result = await engine.execute(
            messages=[{"role": "user", "content": "Task"}],
        )

        assert result.evaluation_score == 0.7
        assert result.reflection_count == 0
        # Boundary case: still only the ReAct call and the evaluation call.
        assert gateway.chat.await_count == 2
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test 2: First attempt scores low, second scores high → returns best result
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestReflexionRetryImprovesScore:
    """A low first score triggers reflection, and the retry scores high."""

    @pytest.mark.asyncio
    async def test_reflection_and_retry_on_low_score(self):
        """One reflection is recorded and the improved retry is returned."""
        reflection_text = "You need to be more specific and provide detailed analysis."
        gateway = make_mock_gateway([
            # 1st ReAct call
            make_response(content="Initial poor answer"),
            # 1st Evaluation - low score
            make_response(content='```json\n{"score": 0.3, "reasoning": "Incomplete"}\n```'),
            # 1st Reflection call
            make_response(content=reflection_text),
            # 2nd ReAct call
            make_response(content="Improved detailed answer"),
            # 2nd Evaluation - high score
            make_response(content='```json\n{"score": 0.85, "reasoning": "Good improvement"}\n```'),
        ])
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7)

        result = await engine.execute(
            messages=[{"role": "user", "content": "Analyze this"}],
        )

        assert result.output == "Improved detailed answer"
        assert result.evaluation_score == 0.85
        assert result.reflection_count == 1
        assert len(result.reflections) == 1

        first_reflection = result.reflections[0]
        assert first_reflection.score_before == 0.3
        assert first_reflection.retry_number == 1
        # The reflection response is fully scripted, so compare it exactly
        # (as the reflection-injection test in this module does) rather than
        # only looking for a keyword.
        assert first_reflection.reflection_text == reflection_text

    @pytest.mark.asyncio
    async def test_multiple_retries_improve_score(self):
        """Scores climb over several retries until the threshold is met."""
        gateway = make_mock_gateway([
            # Attempt 1
            make_response(content="Bad answer"),
            make_response(content='```json\n{"score": 0.2}\n```'),
            make_response(content="Need more depth"),
            # Attempt 2
            make_response(content="Better answer"),
            make_response(content='```json\n{"score": 0.5}\n```'),
            make_response(content="Still needs improvement"),
            # Attempt 3
            make_response(content="Great answer"),
            make_response(content='```json\n{"score": 0.9}\n```'),
        ])
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7, max_reflections=3)

        result = await engine.execute(
            messages=[{"role": "user", "content": "Complex task"}],
        )

        assert result.output == "Great answer"
        assert result.evaluation_score == 0.9
        assert result.reflection_count == 2
        assert len(result.reflections) == 2

        first_reflection, second_reflection = result.reflections
        assert first_reflection.retry_number == 1
        assert second_reflection.retry_number == 2
        # Each reflection records the score of the attempt that triggered it.
        assert first_reflection.score_before == 0.2
        assert second_reflection.score_before == 0.5
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test 3: Max reflections reached → returns best result found
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestReflexionMaxReflectionsReached:
    """When the reflection budget is used up, the best result found is returned."""

    @pytest.mark.asyncio
    async def test_returns_best_result_when_max_reflections_reached(self):
        """Three sub-threshold attempts with rising scores return the last one."""
        scripted = [
            # Attempt 1: answer, evaluation, reflection
            make_response(content="Poor answer"),
            make_response(content='```json\n{"score": 0.3}\n```'),
            make_response(content="Try harder"),
            # Attempt 2: answer, evaluation, reflection
            make_response(content="Slightly better answer"),
            make_response(content='```json\n{"score": 0.5}\n```'),
            make_response(content="Still not good enough"),
            # Attempt 3 (max): answer, evaluation
            make_response(content="Another answer"),
            make_response(content='```json\n{"score": 0.6}\n```'),
        ]
        engine = ReflexionEngine(
            llm_gateway=make_mock_gateway(scripted),
            quality_threshold=0.7,
            max_reflections=3,
        )
        task = [{"role": "user", "content": "Hard task"}]

        result = await engine.execute(messages=task)

        # No attempt reached 0.7; the last attempt (0.6) is the best one.
        assert result.evaluation_score == 0.6
        assert result.reflection_count == 2
        assert result.output == "Another answer"

    @pytest.mark.asyncio
    async def test_returns_earlier_best_when_later_worse(self):
        """If later attempts score lower, the earlier best result is returned."""
        scripted = [
            # Attempt 1: score 0.5
            make_response(content="Decent answer"),
            make_response(content='```json\n{"score": 0.5}\n```'),
            make_response(content="Try to improve"),
            # Attempt 2: score 0.4 (worse)
            make_response(content="Worse answer"),
            make_response(content='```json\n{"score": 0.4}\n```'),
            make_response(content="Still trying"),
            # Attempt 3: score 0.45 (still worse than attempt 1)
            make_response(content="Another answer"),
            make_response(content='```json\n{"score": 0.45}\n```'),
            # Extra reflection response for attempt 3. The original note says
            # it is consumed even though the loop ends - TODO confirm against
            # the engine; the sibling test scripts no such response.
            make_response(content="Final reflection"),
        ]
        engine = ReflexionEngine(
            llm_gateway=make_mock_gateway(scripted),
            quality_threshold=0.7,
            max_reflections=3,
        )
        task = [{"role": "user", "content": "Task"}]

        result = await engine.execute(messages=task)

        # Best score was 0.5, from attempt 1.
        assert result.evaluation_score == 0.5
        assert result.output == "Decent answer"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test 4: Reflection text improves subsequent attempts
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestReflexionReflectionImprovesAttempts:
    """Reflection text feeds into, and improves, the following attempts."""

    @pytest.mark.asyncio
    async def test_reflection_injected_into_system_prompt(self):
        """The reflection text is injected into the next ReAct request."""
        reflection_text = "You need to provide more specific details."
        gateway = make_mock_gateway([
            make_response(content="Poor answer"),
            make_response(content='```json\n{"score": 0.3}\n```'),
            make_response(content=reflection_text),
            make_response(content="Better answer with details"),
            make_response(content='```json\n{"score": 0.9}\n```'),
        ])
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7)

        result = await engine.execute(
            messages=[{"role": "user", "content": "Task"}],
            system_prompt="You are a helpful assistant",
        )

        assert result.reflection_count == 1
        assert result.evaluation_score == 0.9
        assert result.output == "Better answer with details"

        # Verify the reflection was recorded with correct metadata
        assert result.reflections[0].reflection_text == reflection_text
        assert result.reflections[0].score_before == 0.3

        # Verify the injection itself. The scripted order is ReAct, evaluate,
        # reflect, ReAct, evaluate, so call index 3 is the retried ReAct
        # request.
        assert gateway.chat.await_count == 5
        retry_call = gateway.chat.call_args_list[3]
        # Search the call's repr so the check does not depend on whether the
        # engine passes the prompt as a keyword argument or as a system
        # message. NOTE(review): assumes the text is injected verbatim.
        assert reflection_text in repr(retry_call)

    def test_reflexion_composes_react_engine(self):
        """ReflexionEngine composes a ReActEngine rather than inheriting from it."""
        # Only construction is inspected; no LLM call is made, so the gateway
        # needs no scripted responses and the test need not be async.
        engine = ReflexionEngine(llm_gateway=make_mock_gateway([]))

        assert hasattr(engine, "_react_engine")
        assert isinstance(engine._react_engine, ReActEngine)
        assert not isinstance(engine, ReActEngine)

    @pytest.mark.asyncio
    async def test_reflexion_result_has_all_fields(self):
        """ReflexionResult exposes every required field."""
        gateway = make_mock_gateway([
            make_response(content="Answer"),
            make_response(content='```json\n{"score": 0.85}\n```'),
        ])
        engine = ReflexionEngine(llm_gateway=gateway)

        result = await engine.execute(
            messages=[{"role": "user", "content": "Task"}],
        )

        expected_fields = (
            # ReActResult fields
            "output",
            "trajectory",
            "total_steps",
            "total_tokens",
            "status",
            # ReflexionResult additional fields
            "evaluation_score",
            "reflection_count",
            "reflections",
        )
        # Collect every missing name so a failure reports all of them at once.
        missing = [name for name in expected_fields if not hasattr(result, name)]
        assert not missing, f"ReflexionResult is missing fields: {missing}"

        # All trajectory steps are ReActStep
        assert all(isinstance(step, ReActStep) for step in result.trajectory)
|