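# fischer-agentkit/tests/integration/test_reflexion_loop.py
#
# NOTE: the lines below appear to be residue from the web code viewer this file
# was copied from; they are kept as comments so the module parses as Python.
#   293 lines | 11 KiB | Python | Raw Permalink Blame History
#   "This file contains ambiguous Unicode characters"
#   "This file contains Unicode characters that might be confused with other
#   characters. If you think that this is intentional, you can safely ignore
#   this warning. Use the Escape button to reveal them."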

"""集成测试 - Reflexion 多轮循环
Exercises the ReflexionEngine Evaluate→Reflect→Retry loop.
Only LLMGateway (the external API) is mocked; a real ReflexionEngine instance is used.
"""
from __future__ import annotations
from unittest.mock import AsyncMock, MagicMock
import pytest
from agentkit.core.react import ReActEngine, ReActStep
from agentkit.core.reflexion import ReflexionEngine, ReflexionReflection, ReflexionResult
from agentkit.llm.gateway import LLMGateway
from agentkit.llm.protocol import LLMResponse, TokenUsage
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def make_response(
    content: str = "",
    prompt_tokens: int = 10,
    completion_tokens: int = 20,
) -> LLMResponse:
    """Build a canned ``LLMResponse`` for the mocked gateway to hand back.

    Args:
        content: Text stored in the response's ``content`` field.
        prompt_tokens: Value recorded as ``TokenUsage.prompt_tokens``.
        completion_tokens: Value recorded as ``TokenUsage.completion_tokens``.

    Returns:
        An ``LLMResponse`` whose ``model`` is always ``"test-model"``.
    """
    usage = TokenUsage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
    )
    return LLMResponse(content=content, model="test-model", usage=usage)
def make_mock_gateway(responses: list[LLMResponse]) -> MagicMock:
    """Create a ``MagicMock`` specced to ``LLMGateway`` with a scripted ``chat``.

    Args:
        responses: Used as the ``side_effect`` of the async ``chat`` mock, so
            each awaited call returns the next item in order.

    Returns:
        The configured mock gateway.
    """
    scripted_chat = AsyncMock(side_effect=responses)
    mock_gateway = MagicMock(spec=LLMGateway)
    mock_gateway.chat = scripted_chat
    return mock_gateway
# ---------------------------------------------------------------------------
# Test 1: First attempt scores high → no retry, returns result
# ---------------------------------------------------------------------------
class TestReflexionFirstAttemptPasses:
    """The first attempt meets the quality threshold, so no retry is expected."""

    @pytest.mark.asyncio
    async def test_high_score_no_retry(self):
        """A 0.9 score against a 0.7 threshold returns the first answer as-is."""
        scripted = [
            # Returned to the ReAct call.
            make_response(content="The answer is 42"),
            # Returned to the evaluation call: high score.
            make_response(content='```json\n{"score": 0.9, "reasoning": "Excellent"}\n```'),
        ]
        reflexion = ReflexionEngine(
            llm_gateway=make_mock_gateway(scripted),
            quality_threshold=0.7,
        )
        user_messages = [{"role": "user", "content": "What is the answer?"}]

        outcome = await reflexion.execute(messages=user_messages)

        assert isinstance(outcome, ReflexionResult)
        assert outcome.output == "The answer is 42"
        assert outcome.evaluation_score == 0.9
        assert outcome.reflection_count == 0
        assert len(outcome.reflections) == 0
        assert outcome.status == "success"

    @pytest.mark.asyncio
    async def test_score_exactly_at_threshold(self):
        """A score exactly equal to the threshold requires no retry."""
        scripted = [
            make_response(content="Answer"),
            make_response(content='```json\n{"score": 0.7, "reasoning": "OK"}\n```'),
        ]
        reflexion = ReflexionEngine(
            llm_gateway=make_mock_gateway(scripted),
            quality_threshold=0.7,
        )
        user_messages = [{"role": "user", "content": "Task"}]

        outcome = await reflexion.execute(messages=user_messages)

        assert outcome.evaluation_score == 0.7
        assert outcome.reflection_count == 0
# ---------------------------------------------------------------------------
# Test 2: First attempt scores low, second scores high → returns best result
# ---------------------------------------------------------------------------
class TestReflexionRetryImprovesScore:
    """A low first score leads to reflection and a higher-scoring retry."""

    @pytest.mark.asyncio
    async def test_reflection_and_retry_on_low_score(self):
        """One low-scoring attempt is followed by one reflection and a passing retry."""
        scripted = [
            # 1st ReAct call
            make_response(content="Initial poor answer"),
            # 1st evaluation: low score
            make_response(content='```json\n{"score": 0.3, "reasoning": "Incomplete"}\n```'),
            # 1st reflection call
            make_response(content="You need to be more specific and provide detailed analysis."),
            # 2nd ReAct call
            make_response(content="Improved detailed answer"),
            # 2nd evaluation: high score
            make_response(content='```json\n{"score": 0.85, "reasoning": "Good improvement"}\n```'),
        ]
        reflexion = ReflexionEngine(
            llm_gateway=make_mock_gateway(scripted),
            quality_threshold=0.7,
        )
        user_messages = [{"role": "user", "content": "Analyze this"}]

        outcome = await reflexion.execute(messages=user_messages)

        assert outcome.output == "Improved detailed answer"
        assert outcome.evaluation_score == 0.85
        assert outcome.reflection_count == 1
        assert len(outcome.reflections) == 1
        assert outcome.reflections[0].score_before == 0.3
        assert outcome.reflections[0].retry_number == 1
        # The recorded reflection should carry the scripted advice (case-insensitive).
        advice = outcome.reflections[0].reflection_text.lower()
        assert any(keyword in advice for keyword in ("specific", "detailed"))

    @pytest.mark.asyncio
    async def test_multiple_retries_improve_score(self):
        """Scores rise step by step across several retries."""
        scripted = [
            # Attempt 1
            make_response(content="Bad answer"),
            make_response(content='```json\n{"score": 0.2}\n```'),
            make_response(content="Need more depth"),
            # Attempt 2
            make_response(content="Better answer"),
            make_response(content='```json\n{"score": 0.5}\n```'),
            make_response(content="Still needs improvement"),
            # Attempt 3
            make_response(content="Great answer"),
            make_response(content='```json\n{"score": 0.9}\n```'),
        ]
        reflexion = ReflexionEngine(
            llm_gateway=make_mock_gateway(scripted),
            quality_threshold=0.7,
            max_reflections=3,
        )
        user_messages = [{"role": "user", "content": "Complex task"}]

        outcome = await reflexion.execute(messages=user_messages)

        assert outcome.output == "Great answer"
        assert outcome.evaluation_score == 0.9
        assert outcome.reflection_count == 2
        assert len(outcome.reflections) == 2
        # Retries are numbered consecutively starting at 1.
        assert [entry.retry_number for entry in outcome.reflections] == [1, 2]
# ---------------------------------------------------------------------------
# Test 3: Max reflections reached → returns best result found
# ---------------------------------------------------------------------------
class TestReflexionMaxReflectionsReached:
    """When no attempt reaches the threshold, the best-scoring result is returned."""

    @pytest.mark.asyncio
    async def test_returns_best_result_when_max_reflections_reached(self):
        """Three sub-threshold attempts with rising scores return the last one."""
        scripted = [
            # Attempt 1
            make_response(content="Poor answer"),
            make_response(content='```json\n{"score": 0.3}\n```'),
            make_response(content="Try harder"),
            # Attempt 2
            make_response(content="Slightly better answer"),
            make_response(content='```json\n{"score": 0.5}\n```'),
            make_response(content="Still not good enough"),
            # Attempt 3 (max)
            make_response(content="Another answer"),
            make_response(content='```json\n{"score": 0.6}\n```'),
        ]
        reflexion = ReflexionEngine(
            llm_gateway=make_mock_gateway(scripted),
            quality_threshold=0.7,
            max_reflections=3,
        )
        user_messages = [{"role": "user", "content": "Hard task"}]

        outcome = await reflexion.execute(messages=user_messages)

        # The best result here is also the last one (score 0.6).
        assert outcome.evaluation_score == 0.6
        assert outcome.reflection_count == 2
        assert outcome.output == "Another answer"

    @pytest.mark.asyncio
    async def test_returns_earlier_best_when_later_worse(self):
        """When later attempts score lower, the earlier best result is returned."""
        scripted = [
            # Attempt 1: score 0.5
            make_response(content="Decent answer"),
            make_response(content='```json\n{"score": 0.5}\n```'),
            make_response(content="Try to improve"),
            # Attempt 2: score 0.4 (worse)
            make_response(content="Worse answer"),
            make_response(content='```json\n{"score": 0.4}\n```'),
            make_response(content="Still trying"),
            # Attempt 3: score 0.45 (still worse than attempt 1)
            make_response(content="Another answer"),
            make_response(content='```json\n{"score": 0.45}\n```'),
            # Spare response for a possible reflection after attempt 3.
            # NOTE(review): whether the engine actually consumes this one is not
            # visible from this test -- confirm against ReflexionEngine.
            make_response(content="Final reflection"),
        ]
        reflexion = ReflexionEngine(
            llm_gateway=make_mock_gateway(scripted),
            quality_threshold=0.7,
            max_reflections=3,
        )
        user_messages = [{"role": "user", "content": "Task"}]

        outcome = await reflexion.execute(messages=user_messages)

        # Attempt 1 held the best score (0.5), so its output is expected back.
        assert outcome.evaluation_score == 0.5
        assert outcome.output == "Decent answer"
# ---------------------------------------------------------------------------
# Test 4: Reflection text improves subsequent attempts
# ---------------------------------------------------------------------------
class TestReflexionReflectionImprovesAttempts:
    """Reflection text feeds into later attempts; also checks engine/result shape."""

    @pytest.mark.asyncio
    async def test_reflection_injected_into_system_prompt(self):
        """Reflection text is meant to be injected into the next ReAct system prompt.

        NOTE(review): the assertions below only check the recorded reflection and
        the final result; the prompt actually sent to the gateway is not inspected.
        """
        scripted = [
            make_response(content="Poor answer"),
            make_response(content='```json\n{"score": 0.3}\n```'),
            make_response(content="You need to provide more specific details."),
            make_response(content="Better answer with details"),
            make_response(content='```json\n{"score": 0.9}\n```'),
        ]
        reflexion = ReflexionEngine(
            llm_gateway=make_mock_gateway(scripted),
            quality_threshold=0.7,
        )
        user_messages = [{"role": "user", "content": "Task"}]

        outcome = await reflexion.execute(
            messages=user_messages,
            system_prompt="You are a helpful assistant",
        )

        assert outcome.reflection_count == 1
        assert outcome.evaluation_score == 0.9
        assert outcome.output == "Better answer with details"
        # The reflection must be recorded with the scripted text and prior score.
        assert outcome.reflections[0].reflection_text == "You need to provide more specific details."
        assert outcome.reflections[0].score_before == 0.3

    @pytest.mark.asyncio
    async def test_reflexion_composes_react_engine(self):
        """ReflexionEngine composes a ReActEngine rather than inheriting from it."""
        # The scripted responses are never consumed: this test only inspects the
        # engine object and makes no gateway call.
        scripted = [
            make_response(content="Answer"),
            make_response(content='```json\n{"score": 0.9}\n```'),
        ]
        reflexion = ReflexionEngine(llm_gateway=make_mock_gateway(scripted))

        assert hasattr(reflexion, "_react_engine")
        assert isinstance(reflexion._react_engine, ReActEngine)
        assert not isinstance(reflexion, ReActEngine)

    @pytest.mark.asyncio
    async def test_reflexion_result_has_all_fields(self):
        """ReflexionResult exposes every required field."""
        scripted = [
            make_response(content="Answer"),
            make_response(content='```json\n{"score": 0.85}\n```'),
        ]
        reflexion = ReflexionEngine(llm_gateway=make_mock_gateway(scripted))
        user_messages = [{"role": "user", "content": "Task"}]

        outcome = await reflexion.execute(messages=user_messages)

        # ReActResult fields
        for field_name in ("output", "trajectory", "total_steps", "total_tokens", "status"):
            assert hasattr(outcome, field_name)
        # ReflexionResult additional fields
        for field_name in ("evaluation_score", "reflection_count", "reflections"):
            assert hasattr(outcome, field_name)
        # All trajectory steps are ReActStep
        assert all(isinstance(step, ReActStep) for step in outcome.trajectory)