# fischer-agentkit/tests/integration/test_reflexion_loop.py

"""Integration tests for the multi-round Reflexion loop.

Exercises the Evaluate→Reflect→Retry cycle of ReflexionEngine.
Only LLMGateway (the external API) is mocked; a real ReflexionEngine instance is used.
"""

from __future__ import annotations

from unittest.mock import AsyncMock, MagicMock

import pytest

from agentkit.core.react import ReActEngine, ReActStep
from agentkit.core.reflexion import ReflexionEngine, ReflexionReflection, ReflexionResult
from agentkit.llm.gateway import LLMGateway
from agentkit.llm.protocol import LLMResponse, TokenUsage


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def make_response(
    content: str = "",
    prompt_tokens: int = 10,
    completion_tokens: int = 20,
) -> LLMResponse:
    """Build a canned ``LLMResponse`` for scripting the mocked gateway.

    Args:
        content: Text the fake model "returned".
        prompt_tokens: Prompt token count recorded in the usage block.
        completion_tokens: Completion token count recorded in the usage block.

    Returns:
        An ``LLMResponse`` whose model name is always ``"test-model"``.
    """
    token_usage = TokenUsage(
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
    )
    return LLMResponse(content=content, model="test-model", usage=token_usage)


def make_mock_gateway(responses: list[LLMResponse]) -> MagicMock:
    """Return a ``MagicMock`` specced on ``LLMGateway`` with a scripted ``chat``.

    Args:
        responses: Replies handed out one per ``chat`` call, in list order
            (they are installed as the ``AsyncMock`` ``side_effect``).

    Returns:
        The mock gateway; its ``chat`` attribute is an ``AsyncMock``.
    """
    scripted_chat = AsyncMock(side_effect=responses)
    mock_gateway = MagicMock(spec=LLMGateway)
    mock_gateway.chat = scripted_chat
    return mock_gateway


# ---------------------------------------------------------------------------
# Test 1: First attempt scores high → no retry, returns result
# ---------------------------------------------------------------------------


class TestReflexionFirstAttemptPasses:
    """The first attempt already meets the quality threshold, so no retry happens."""

    @pytest.mark.asyncio
    async def test_high_score_no_retry(self):
        """A 0.9 score against a 0.7 threshold returns the first answer untouched."""
        # Scripted replies, in the order they are handed out by the mocked chat().
        react_reply = make_response(content="The answer is 42")
        evaluation_reply = make_response(
            content='```json\n{"score": 0.9, "reasoning": "Excellent"}\n```'
        )
        llm = make_mock_gateway([react_reply, evaluation_reply])
        reflexion = ReflexionEngine(llm_gateway=llm, quality_threshold=0.7)
        task = [{"role": "user", "content": "What is the answer?"}]

        outcome = await reflexion.execute(messages=task)

        assert isinstance(outcome, ReflexionResult)
        assert outcome.output == "The answer is 42"
        assert outcome.evaluation_score == 0.9
        # Nothing should have been reflected on.
        assert outcome.reflection_count == 0
        assert len(outcome.reflections) == 0
        assert outcome.status == "success"

    @pytest.mark.asyncio
    async def test_score_exactly_at_threshold(self):
        """A score exactly equal to the threshold needs no retry."""
        react_reply = make_response(content="Answer")
        evaluation_reply = make_response(
            content='```json\n{"score": 0.7, "reasoning": "OK"}\n```'
        )
        llm = make_mock_gateway([react_reply, evaluation_reply])
        reflexion = ReflexionEngine(llm_gateway=llm, quality_threshold=0.7)
        task = [{"role": "user", "content": "Task"}]

        outcome = await reflexion.execute(messages=task)

        assert outcome.evaluation_score == 0.7
        assert outcome.reflection_count == 0


# ---------------------------------------------------------------------------
# Test 2: First attempt scores low, second scores high → returns best result
# ---------------------------------------------------------------------------


class TestReflexionRetryImprovesScore:
    """A low first score is followed by reflection and a higher-scoring retry."""

    @pytest.mark.asyncio
    async def test_reflection_and_retry_on_low_score(self):
        """One low-scoring attempt yields one recorded reflection and a passing retry."""
        reflection_text = "You need to be more specific and provide detailed analysis."
        gateway = make_mock_gateway([
            # 1st ReAct call
            make_response(content="Initial poor answer"),
            # 1st Evaluation - low score
            make_response(content='```json\n{"score": 0.3, "reasoning": "Incomplete"}\n```'),
            # 1st Reflection call
            make_response(content=reflection_text),
            # 2nd ReAct call
            make_response(content="Improved detailed answer"),
            # 2nd Evaluation - high score
            make_response(content='```json\n{"score": 0.85, "reasoning": "Good improvement"}\n```'),
        ])
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7)

        result = await engine.execute(
            messages=[{"role": "user", "content": "Analyze this"}],
        )

        assert result.output == "Improved detailed answer"
        assert result.evaluation_score == 0.85
        assert result.reflection_count == 1
        assert len(result.reflections) == 1

        reflection = result.reflections[0]
        assert reflection.score_before == 0.3
        assert reflection.retry_number == 1
        # The mocked reflection text is fully known, so pin it exactly rather
        # than accepting any text that merely contains one of two keywords.
        assert reflection.reflection_text == reflection_text

    @pytest.mark.asyncio
    async def test_multiple_retries_improve_score(self):
        """Scores rise step by step (0.2 → 0.5 → 0.9) across several retries."""
        gateway = make_mock_gateway([
            # Attempt 1: answer, evaluation, reflection
            make_response(content="Bad answer"),
            make_response(content='```json\n{"score": 0.2}\n```'),
            make_response(content="Need more depth"),
            # Attempt 2: answer, evaluation, reflection
            make_response(content="Better answer"),
            make_response(content='```json\n{"score": 0.5}\n```'),
            make_response(content="Still needs improvement"),
            # Attempt 3: answer, evaluation (passes the threshold)
            make_response(content="Great answer"),
            make_response(content='```json\n{"score": 0.9}\n```'),
        ])
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7, max_reflections=3)

        result = await engine.execute(
            messages=[{"role": "user", "content": "Complex task"}],
        )

        assert result.output == "Great answer"
        assert result.evaluation_score == 0.9
        assert result.reflection_count == 2
        assert len(result.reflections) == 2
        # Retry numbers are 1-based and follow the order the reflections happened.
        assert result.reflections[0].retry_number == 1
        assert result.reflections[1].retry_number == 2


# ---------------------------------------------------------------------------
# Test 3: Max reflections reached → returns best result found
# ---------------------------------------------------------------------------


class TestReflexionMaxReflectionsReached:
    """Once the maximum number of reflections is reached, the best result is returned."""

    @pytest.mark.asyncio
    async def test_returns_best_result_when_max_reflections_reached(self):
        """Scores climb 0.3 → 0.5 → 0.6 without reaching 0.7; the last attempt is best."""
        scripted_replies = [
            # Attempt 1: answer, evaluation, reflection
            make_response(content="Poor answer"),
            make_response(content='```json\n{"score": 0.3}\n```'),
            make_response(content="Try harder"),
            # Attempt 2: answer, evaluation, reflection
            make_response(content="Slightly better answer"),
            make_response(content='```json\n{"score": 0.5}\n```'),
            make_response(content="Still not good enough"),
            # Attempt 3 (max): answer, evaluation
            make_response(content="Another answer"),
            make_response(content='```json\n{"score": 0.6}\n```'),
        ]
        llm = make_mock_gateway(scripted_replies)
        reflexion = ReflexionEngine(
            llm_gateway=llm,
            quality_threshold=0.7,
            max_reflections=3,
        )
        task = [{"role": "user", "content": "Hard task"}]

        outcome = await reflexion.execute(messages=task)

        # The best result is the final attempt, which scored 0.6.
        assert outcome.evaluation_score == 0.6
        assert outcome.reflection_count == 2
        assert outcome.output == "Another answer"

    @pytest.mark.asyncio
    async def test_returns_earlier_best_when_later_worse(self):
        """When later attempts score lower, the earlier best result is returned."""
        scripted_replies = [
            # Attempt 1: score 0.5
            make_response(content="Decent answer"),
            make_response(content='```json\n{"score": 0.5}\n```'),
            make_response(content="Try to improve"),
            # Attempt 2: score 0.4 (worse)
            make_response(content="Worse answer"),
            make_response(content='```json\n{"score": 0.4}\n```'),
            make_response(content="Still trying"),
            # Attempt 3: score 0.45 (still worse than attempt 1)
            make_response(content="Another answer"),
            make_response(content='```json\n{"score": 0.45}\n```'),
            # Extra reflection reply after attempt 3.
            # NOTE(review): the original note said this is consumed before the
            # loop ends, yet the sibling test above scripts no such reply —
            # confirm whether the engine actually requests it.
            make_response(content="Final reflection"),
        ]
        llm = make_mock_gateway(scripted_replies)
        reflexion = ReflexionEngine(
            llm_gateway=llm,
            quality_threshold=0.7,
            max_reflections=3,
        )
        task = [{"role": "user", "content": "Task"}]

        outcome = await reflexion.execute(messages=task)

        # Attempt 1 holds the best score (0.5), so its output must be the one returned.
        assert outcome.evaluation_score == 0.5
        assert outcome.output == "Decent answer"


# ---------------------------------------------------------------------------
# Test 4: Reflection text improves subsequent attempts
# ---------------------------------------------------------------------------


class TestReflexionReflectionImprovesAttempts:
    """Reflection text feeds into, and improves, the following attempt."""

    @pytest.mark.asyncio
    async def test_reflection_injected_into_system_prompt(self):
        """The reflection text is injected into the next ReAct request."""
        reflection_text = "You need to provide more specific details."
        gateway = make_mock_gateway([
            # 1st ReAct call
            make_response(content="Poor answer"),
            # 1st Evaluation - low score
            make_response(content='```json\n{"score": 0.3}\n```'),
            # 1st Reflection call
            make_response(content=reflection_text),
            # 2nd ReAct call
            make_response(content="Better answer with details"),
            # 2nd Evaluation - high score
            make_response(content='```json\n{"score": 0.9}\n```'),
        ])
        engine = ReflexionEngine(llm_gateway=gateway, quality_threshold=0.7)

        result = await engine.execute(
            messages=[{"role": "user", "content": "Task"}],
            system_prompt="You are a helpful assistant",
        )

        assert result.reflection_count == 1
        assert result.evaluation_score == 0.9
        assert result.output == "Better answer with details"

        # Verify the reflection was recorded with correct metadata
        assert result.reflections[0].reflection_text == reflection_text
        assert result.reflections[0].score_before == 0.3

        # Verify the injection itself: replies are scripted as ReAct,
        # evaluation, reflection, ReAct retry, evaluation, so the 4th chat()
        # call is the retry's request and must carry the reflection text.
        # NOTE(review): this relies on the request arguments having a repr
        # that exposes message text (true for plain dicts/strings) — adjust
        # if the gateway is called with opaque message objects.
        retry_call = gateway.chat.call_args_list[3]
        assert reflection_text in repr(retry_call)

    @pytest.mark.asyncio
    async def test_reflexion_composes_react_engine(self):
        """ReflexionEngine composes a ReActEngine instead of inheriting from it."""
        # execute() is never called in this test, so no replies are scripted.
        gateway = make_mock_gateway([])
        engine = ReflexionEngine(llm_gateway=gateway)

        assert hasattr(engine, "_react_engine")
        assert isinstance(engine._react_engine, ReActEngine)
        assert not isinstance(engine, ReActEngine)

    @pytest.mark.asyncio
    async def test_reflexion_result_has_all_fields(self):
        """ReflexionResult exposes every required field."""
        gateway = make_mock_gateway([
            make_response(content="Answer"),
            make_response(content='```json\n{"score": 0.85}\n```'),
        ])
        engine = ReflexionEngine(llm_gateway=gateway)

        result = await engine.execute(
            messages=[{"role": "user", "content": "Task"}],
        )

        # ReActResult fields
        react_fields = ("output", "trajectory", "total_steps", "total_tokens", "status")
        for field_name in react_fields:
            assert hasattr(result, field_name), f"missing ReActResult field: {field_name}"

        # ReflexionResult additional fields
        reflexion_fields = ("evaluation_score", "reflection_count", "reflections")
        for field_name in reflexion_fields:
            assert hasattr(result, field_name), f"missing ReflexionResult field: {field_name}"

        # All trajectory steps are ReActStep
        assert all(isinstance(step, ReActStep) for step in result.trajectory)