"""集成测试 - Reflexion 多轮循环 测试 ReflexionEngine 的 Evaluate→Reflect→Retry 循环。 仅 mock LLMGateway(外部 API),使用真实 ReflexionEngine 实例。 """ from __future__ import annotations from unittest.mock import AsyncMock, MagicMock import pytest from agentkit.core.react import ReActEngine, ReActStep from agentkit.core.reflexion import ReflexionEngine, ReflexionReflection, ReflexionResult from agentkit.llm.gateway import LLMGateway from agentkit.llm.protocol import LLMResponse, TokenUsage # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def make_response( content: str = "", prompt_tokens: int = 10, completion_tokens: int = 20, ) -> LLMResponse: return LLMResponse( content=content, model="test-model", usage=TokenUsage( prompt_tokens=prompt_tokens, completion_tokens=completion_tokens, ), ) def make_mock_gateway(responses: list[LLMResponse]) -> MagicMock: gateway = MagicMock(spec=LLMGateway) gateway.chat = AsyncMock(side_effect=responses) return gateway # --------------------------------------------------------------------------- # Test 1: First attempt scores high → no retry, returns result # --------------------------------------------------------------------------- class TestReflexionFirstAttemptPasses: """首次尝试分数高于阈值,无需重试""" @pytest.mark.asyncio async def test_high_score_no_retry(self): gateway = make_mock_gateway([ # ReAct call make_response(content="The answer is 42"), # Evaluation call - high score make_response(content='```json\n{"score": 0.9, "reasoning": "Excellent"}\n```'), ]) engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7) result = await engine.execute( messages=[{"role": "user", "content": "What is the answer?"}], ) assert isinstance(result, ReflexionResult) assert result.output == "The answer is 42" assert result.evaluation_score == 0.9 assert result.reflection_count == 0 assert len(result.reflections) == 0 assert result.status == "success" @pytest.mark.asyncio async def test_score_exactly_at_threshold(self): """分数恰好等于阈值,无需重试""" gateway = make_mock_gateway([ make_response(content="Answer"), make_response(content='```json\n{"score": 0.7, "reasoning": "OK"}\n```'), ]) engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7) result = await engine.execute( messages=[{"role": "user", "content": "Task"}], ) assert result.evaluation_score == 0.7 assert result.reflection_count == 0 # --------------------------------------------------------------------------- # Test 2: First attempt scores low, second scores high → returns best result # --------------------------------------------------------------------------- class TestReflexionRetryImprovesScore: """首次低分,反思后重试高分""" @pytest.mark.asyncio async def test_reflection_and_retry_on_low_score(self): gateway = make_mock_gateway([ # 1st ReAct call make_response(content="Initial poor answer"), # 1st Evaluation - low score make_response(content='```json\n{"score": 0.3, "reasoning": "Incomplete"}\n```'), # 1st Reflection call make_response(content="You need to be more specific and provide detailed analysis."), # 2nd ReAct call make_response(content="Improved detailed answer"), # 2nd Evaluation - high score make_response(content='```json\n{"score": 0.85, "reasoning": "Good improvement"}\n```'), ]) engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7) result = await engine.execute( messages=[{"role": "user", "content": "Analyze this"}], ) assert result.output == "Improved detailed answer" assert result.evaluation_score == 0.85 assert result.reflection_count == 1 assert len(result.reflections) == 1 assert result.reflections[0].score_before == 0.3 assert result.reflections[0].retry_number == 1 assert "specific" in result.reflections[0].reflection_text.lower() or "detailed" in result.reflections[0].reflection_text.lower() @pytest.mark.asyncio async def test_multiple_retries_improve_score(self): """多次重试后分数逐步提升""" gateway = make_mock_gateway([ # Attempt 1 make_response(content="Bad answer"), make_response(content='```json\n{"score": 0.2}\n```'), make_response(content="Need more depth"), # Attempt 2 make_response(content="Better answer"), make_response(content='```json\n{"score": 0.5}\n```'), make_response(content="Still needs improvement"), # Attempt 3 make_response(content="Great answer"), make_response(content='```json\n{"score": 0.9}\n```'), ]) engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7, max_reflections=3) result = await engine.execute( messages=[{"role": "user", "content": "Complex task"}], ) assert result.output == "Great answer" assert result.evaluation_score == 0.9 assert result.reflection_count == 2 assert len(result.reflections) == 2 assert result.reflections[0].retry_number == 1 assert result.reflections[1].retry_number == 2 # --------------------------------------------------------------------------- # Test 3: Max reflections reached → returns best result found # --------------------------------------------------------------------------- class TestReflexionMaxReflectionsReached: """达到最大反思次数后返回最佳结果""" @pytest.mark.asyncio async def test_returns_best_result_when_max_reflections_reached(self): gateway = make_mock_gateway([ # Attempt 1 make_response(content="Poor answer"), make_response(content='```json\n{"score": 0.3}\n```'), make_response(content="Try harder"), # Attempt 2 make_response(content="Slightly better answer"), make_response(content='```json\n{"score": 0.5}\n```'), make_response(content="Still not good enough"), # Attempt 3 (max) make_response(content="Another answer"), make_response(content='```json\n{"score": 0.6}\n```'), ]) engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7, max_reflections=3) result = await engine.execute( messages=[{"role": "user", "content": "Hard task"}], ) # Should return the best result (score 0.6 from last attempt) assert result.evaluation_score == 0.6 assert result.reflection_count == 2 assert result.output == "Another answer" @pytest.mark.asyncio async def test_returns_earlier_best_when_later_worse(self): """后续尝试分数更低时,返回之前最佳结果""" gateway = make_mock_gateway([ # Attempt 1: score 0.5 make_response(content="Decent answer"), make_response(content='```json\n{"score": 0.5}\n```'), make_response(content="Try to improve"), # Attempt 2: score 0.4 (worse) make_response(content="Worse answer"), make_response(content='```json\n{"score": 0.4}\n```'), make_response(content="Still trying"), # Attempt 3: score 0.45 (still worse than attempt 1) make_response(content="Another answer"), make_response(content='```json\n{"score": 0.45}\n```'), # Reflection for attempt 3 (consumed but loop ends) make_response(content="Final reflection"), ]) engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7, max_reflections=3) result = await engine.execute( messages=[{"role": "user", "content": "Task"}], ) # Best score was 0.5 from attempt 1 assert result.evaluation_score == 0.5 assert result.output == "Decent answer" # --------------------------------------------------------------------------- # Test 4: Reflection text improves subsequent attempts # --------------------------------------------------------------------------- class TestReflexionReflectionImprovesAttempts: """反思文本改善后续尝试""" @pytest.mark.asyncio async def test_reflection_injected_into_system_prompt(self): """反思文本被注入到下一次 ReAct 的 system prompt 中""" gateway = make_mock_gateway([ make_response(content="Poor answer"), make_response(content='```json\n{"score": 0.3}\n```'), make_response(content="You need to provide more specific details."), make_response(content="Better answer with details"), make_response(content='```json\n{"score": 0.9}\n```'), ]) engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7) result = await engine.execute( messages=[{"role": "user", "content": "Task"}], system_prompt="You are a helpful assistant", ) assert result.reflection_count == 1 assert result.evaluation_score == 0.9 assert result.output == "Better answer with details" # Verify the reflection was recorded with correct metadata assert result.reflections[0].reflection_text == "You need to provide more specific details." assert result.reflections[0].score_before == 0.3 @pytest.mark.asyncio async def test_reflexion_composes_react_engine(self): """ReflexionEngine 组合(而非继承)ReActEngine""" gateway = make_mock_gateway([ make_response(content="Answer"), make_response(content='```json\n{"score": 0.9}\n```'), ]) engine = ReflexionEngine(llm_gateway=gateway) assert hasattr(engine, "_react_engine") assert isinstance(engine._react_engine, ReActEngine) assert not isinstance(engine, ReActEngine) @pytest.mark.asyncio async def test_reflexion_result_has_all_fields(self): """ReflexionResult 包含所有必要字段""" gateway = make_mock_gateway([ make_response(content="Answer"), make_response(content='```json\n{"score": 0.85}\n```'), ]) engine = ReflexionEngine(llm_gateway=gateway) result = await engine.execute( messages=[{"role": "user", "content": "Task"}], ) # ReActResult fields assert hasattr(result, "output") assert hasattr(result, "trajectory") assert hasattr(result, "total_steps") assert hasattr(result, "total_tokens") assert hasattr(result, "status") # ReflexionResult additional fields assert hasattr(result, "evaluation_score") assert hasattr(result, "reflection_count") assert hasattr(result, "reflections") # All trajectory steps are ReActStep assert all(isinstance(step, ReActStep) for step in result.trajectory)