"""Tests for EvolutionMixin - 进化引擎与 Agent 生命周期集成""" import pytest from agentkit.core.protocol import TaskMessage, TaskResult, TaskStatus from agentkit.evolution.ab_tester import ABTestConfig, ABTestResult, ABTester from agentkit.evolution.evolution_store import InMemoryEvolutionStore from agentkit.evolution.lifecycle import EvolutionLogEntry, EvolutionMixin from agentkit.evolution.prompt_optimizer import Module, PromptOptimizer, Signature from agentkit.evolution.reflector import Reflection, Reflector from agentkit.evolution.strategy_tuner import StrategyConfig, StrategyTuner from datetime import datetime, timezone def _make_task(task_id: str = "test-001") -> TaskMessage: return TaskMessage( task_id=task_id, agent_name="evolving_agent", task_type="echo", priority=0, input_data={"query": "hello"}, callback_url=None, created_at=datetime.now(timezone.utc), ) def _make_result(status: str = TaskStatus.COMPLETED) -> TaskResult: return TaskResult( task_id="test-001", agent_name="evolving_agent", status=status, output_data={"key": "value"}, error_message=None, started_at=datetime.now(timezone.utc), completed_at=datetime.now(timezone.utc), metrics={"elapsed_seconds": 5.0}, ) def _make_module() -> Module: return Module( name="test_module", signature=Signature( input_fields={"query": "search query"}, output_fields={"result": "search result"}, instruction="Find the best result.", ), ) # ── EvolutionMixin 与 Agent on_task_complete 集成 ────────────── class EvolvingAgent(EvolutionMixin): """模拟集成了 EvolutionMixin 的 Agent""" def __init__(self, reflector=None, prompt_optimizer=None, ab_tester=None, evolution_store=None, strategy_tuner=None, strategy_tuning_enabled=False): super().__init__( reflector=reflector, prompt_optimizer=prompt_optimizer, ab_tester=ab_tester, evolution_store=evolution_store, strategy_tuner=strategy_tuner, strategy_tuning_enabled=strategy_tuning_enabled, ) self.name = "evolving_agent" self.evolve_called = False async def on_task_complete(self, task: TaskMessage, output: dict) -> None: """任务完成后触发进化""" result = _make_result() await self.evolve_after_task(task, result) self.evolve_called = True @pytest.mark.asyncio async def test_mixin_integrates_with_on_task_complete(): """EvolutionMixin 与 Agent 的 on_task_complete 集成""" reflector = Reflector() agent = EvolvingAgent(reflector=reflector) agent.set_current_module(_make_module()) task = _make_task() await agent.on_task_complete(task, {"key": "value"}) assert agent.evolve_called is True history = agent.get_evolution_history() assert len(history) == 1 assert history[0]["task_id"] == "test-001" # ── Reflector 生成反思 ────────────────────────────────────── @pytest.mark.asyncio async def test_reflector_generates_reflection_after_task(): """Reflector 在任务完成后生成反思""" reflector = Reflector() mixin = EvolutionMixin(reflector=reflector) mixin.set_current_module(_make_module()) task = _make_task() result = _make_result() entry = await mixin.evolve_after_task(task, result) assert entry.reflection is not None assert entry.reflection.outcome == "success" assert entry.reflection.quality_score > 0 # ── Prompt 优化在有改进建议时触发 ────────────────────────────── class LowQualityReflector(Reflector): """总是产生低质量结果和改进建议的 Reflector""" async def reflect(self, task, result): return Reflection( task_id=task.task_id, agent_name=result.agent_name, outcome="failure", quality_score=0.2, patterns=["slow_execution"], insights=["Low quality score indicates potential issues"], suggestions=["Consider prompt optimization for this task type"], ) @pytest.mark.asyncio async def test_prompt_optimization_triggered_when_reflection_suggests_improvement(): """当反思建议改进时,触发 Prompt 优化""" reflector = LowQualityReflector() optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1) # 预填充足够的成功样本以触发优化 for i in range(3): optimizer.add_example( input_data={"query": f"q_{i}"}, output_data={"result": f"r_{i}"}, quality_score=0.9, ) mixin = EvolutionMixin(reflector=reflector, prompt_optimizer=optimizer) module = _make_module() mixin.set_current_module(module) task = _make_task() result = _make_result() entry = await mixin.evolve_after_task(task, result) assert entry.reflection is not None assert len(entry.reflection.suggestions) > 0 assert entry.optimized_module is not None assert entry.optimized_module.name == "test_module_optimized" @pytest.mark.asyncio async def test_no_optimization_when_no_suggestions(): """当反思没有改进建议时,不触发优化""" # 默认 Reflector 对成功任务不会产生建议 reflector = Reflector() mixin = EvolutionMixin(reflector=reflector, prompt_optimizer=PromptOptimizer()) mixin.set_current_module(_make_module()) task = _make_task() result = _make_result() entry = await mixin.evolve_after_task(task, result) assert entry.reflection is not None assert entry.optimized_module is None # ── AB 测试验证 ────────────────────────────────────────────── class SucceedingABTester(ABTester): """总是让实验组获胜的 AB 测试器""" async def evaluate(self, test_id: str) -> ABTestResult | None: return ABTestResult( test_id=test_id, control_metric=0.5, experiment_metric=0.8, control_samples=10, experiment_samples=10, is_significant=True, winner="experiment", p_value=0.01, ) class FailingABTester(ABTester): """总是让对照组获胜的 AB 测试器""" async def evaluate(self, test_id: str) -> ABTestResult | None: return ABTestResult( test_id=test_id, control_metric=0.8, experiment_metric=0.5, control_samples=10, experiment_samples=10, is_significant=True, winner="control", p_value=0.01, ) class InconclusiveABTester(ABTester): """总是返回不显著结果的 AB 测试器""" async def evaluate(self, test_id: str) -> ABTestResult | None: return ABTestResult( test_id=test_id, control_metric=0.5, experiment_metric=0.52, control_samples=10, experiment_samples=10, is_significant=False, winner=None, p_value=0.8, ) @pytest.mark.asyncio async def test_ab_test_significant_treatment_wins(): """A/B 测试显著且实验组获胜时应用变更""" reflector = LowQualityReflector() optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1) for i in range(3): optimizer.add_example( input_data={"query": f"q_{i}"}, output_data={"result": f"r_{i}"}, quality_score=0.9, ) ab_tester = SucceedingABTester() mixin = EvolutionMixin( reflector=reflector, prompt_optimizer=optimizer, ab_tester=ab_tester, ) mixin.set_current_module(_make_module()) task = _make_task() result = _make_result() entry = await mixin.evolve_after_task(task, result) assert entry.ab_test_result is not None assert entry.ab_test_result.is_significant is True assert entry.ab_test_result.winner == "experiment" assert entry.applied is True assert entry.rolled_back is False @pytest.mark.asyncio async def test_ab_test_significant_control_wins(): """A/B 测试显著且对照组获胜时回滚""" reflector = LowQualityReflector() optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1) for i in range(3): optimizer.add_example( input_data={"query": f"q_{i}"}, output_data={"result": f"r_{i}"}, quality_score=0.9, ) ab_tester = FailingABTester() mixin = EvolutionMixin( reflector=reflector, prompt_optimizer=optimizer, ab_tester=ab_tester, ) original_module = _make_module() mixin.set_current_module(original_module) task = _make_task() result = _make_result() entry = await mixin.evolve_after_task(task, result) assert entry.ab_test_result is not None assert entry.ab_test_result.is_significant is True assert entry.ab_test_result.winner == "control" assert entry.rolled_back is True assert entry.applied is False # 模块不应被更新 assert mixin._current_module.name == "test_module" @pytest.mark.asyncio async def test_ab_test_inconclusive_keeps_current(): """A/B 测试不显著时保持当前 prompt""" reflector = LowQualityReflector() optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1) for i in range(3): optimizer.add_example( input_data={"query": f"q_{i}"}, output_data={"result": f"r_{i}"}, quality_score=0.9, ) ab_tester = InconclusiveABTester() mixin = EvolutionMixin( reflector=reflector, prompt_optimizer=optimizer, ab_tester=ab_tester, ) original_module = _make_module() mixin.set_current_module(original_module) task = _make_task() result = _make_result() entry = await mixin.evolve_after_task(task, result) assert entry.ab_test_result is not None assert entry.ab_test_result.is_significant is False assert entry.applied is False assert entry.rolled_back is False # Module stays the same assert mixin._current_module.name == "test_module" # ── 进化历史记录 ────────────────────────────────────────────── @pytest.mark.asyncio async def test_evolution_history_is_recorded(): """进化历史被正确记录""" reflector = Reflector() mixin = EvolutionMixin(reflector=reflector) mixin.set_current_module(_make_module()) task = _make_task() result = _make_result() await mixin.evolve_after_task(task, result) history = mixin.get_evolution_history() assert len(history) == 1 assert history[0]["task_id"] == "test-001" assert "reflection" in history[0] assert history[0]["reflection"]["outcome"] == "success" @pytest.mark.asyncio async def test_evolution_history_multiple_entries(): """多次进化产生多条历史记录""" reflector = Reflector() mixin = EvolutionMixin(reflector=reflector) mixin.set_current_module(_make_module()) for i in range(3): task = TaskMessage( task_id=f"test-{i:03d}", agent_name="evolving_agent", task_type="echo", priority=0, input_data={"query": f"hello_{i}"}, callback_url=None, created_at=datetime.now(timezone.utc), ) result = TaskResult( task_id=f"test-{i:03d}", agent_name="evolving_agent", status=TaskStatus.COMPLETED, output_data={"key": "value"}, error_message=None, started_at=datetime.now(timezone.utc), completed_at=datetime.now(timezone.utc), metrics={"elapsed_seconds": 5.0}, ) await mixin.evolve_after_task(task, result) history = mixin.get_evolution_history() assert len(history) == 3 assert history[0]["task_id"] == "test-000" assert history[1]["task_id"] == "test-001" assert history[2]["task_id"] == "test-002" # ── 无组件配置时的优雅降级 ────────────────────────────────────── @pytest.mark.asyncio async def test_no_reflector_skips_evolution(): """没有 Reflector 时跳过进化""" mixin = EvolutionMixin() mixin.set_current_module(_make_module()) task = _make_task() result = _make_result() entry = await mixin.evolve_after_task(task, result) assert entry.reflection is None assert entry.applied is False @pytest.mark.asyncio async def test_no_evolution_store_applies_directly(): """没有 EvolutionStore 时直接在内存中应用变更""" reflector = LowQualityReflector() optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1) for i in range(3): optimizer.add_example( input_data={"query": f"q_{i}"}, output_data={"result": f"r_{i}"}, quality_score=0.9, ) mixin = EvolutionMixin(reflector=reflector, prompt_optimizer=optimizer) mixin.set_current_module(_make_module()) task = _make_task() result = _make_result() entry = await mixin.evolve_after_task(task, result) # 没有 AB tester,也没有 store,直接应用 assert entry.applied is True assert mixin._current_module.name == "test_module_optimized" # ── Strategy Tuning 集成 ────────────────────────────────────── @pytest.mark.asyncio async def test_strategy_tuning_called_when_enabled(): """策略调优启用时在进化流程中被调用""" reflector = LowQualityReflector() optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1) for i in range(3): optimizer.add_example( input_data={"query": f"q_{i}"}, output_data={"result": f"r_{i}"}, quality_score=0.9, ) tuner = StrategyTuner() # Pre-fill tuner history so suggest() doesn't return current for i in range(5): tuner.record(StrategyConfig(temperature=0.5, max_iterations=5), 0.3 + i * 0.1) mixin = EvolutionMixin( reflector=reflector, prompt_optimizer=optimizer, strategy_tuner=tuner, strategy_tuning_enabled=True, ) mixin.set_current_module(_make_module()) task = _make_task() result = _make_result() entry = await mixin.evolve_after_task(task, result) # Strategy tuner should have been called and recorded the result assert len(tuner._history) >= 6 # 5 pre-filled + 1 from evolution @pytest.mark.asyncio async def test_strategy_tuning_not_called_when_disabled(): """策略调优未启用时不被调用""" reflector = LowQualityReflector() optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1) for i in range(3): optimizer.add_example( input_data={"query": f"q_{i}"}, output_data={"result": f"r_{i}"}, quality_score=0.9, ) tuner = StrategyTuner() mixin = EvolutionMixin( reflector=reflector, prompt_optimizer=optimizer, strategy_tuner=tuner, strategy_tuning_enabled=False, # Disabled ) mixin.set_current_module(_make_module()) task = _make_task() result = _make_result() entry = await mixin.evolve_after_task(task, result) # Strategy tuner should NOT have been called assert len(tuner._history) == 0 # ── End-to-end: reflect → optimize → A/B test → apply/rollback ────────── @pytest.mark.asyncio async def test_end_to_end_evolution_with_ab_test(): """端到端测试:反思 → 优化 → A/B 测试 → 应用""" reflector = LowQualityReflector() optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1) for i in range(3): optimizer.add_example( input_data={"query": f"q_{i}"}, output_data={"result": f"r_{i}"}, quality_score=0.9, ) store = InMemoryEvolutionStore() ab_tester = SucceedingABTester(evolution_store=store, min_samples=10) mixin = EvolutionMixin( reflector=reflector, prompt_optimizer=optimizer, ab_tester=ab_tester, evolution_store=store, ) mixin.set_current_module(_make_module()) task = _make_task() result = _make_result() entry = await mixin.evolve_after_task(task, result) # Full pipeline: reflected → optimized → A/B tested → applied assert entry.reflection is not None assert entry.optimized_module is not None assert entry.ab_test_result is not None assert entry.applied is True assert mixin._current_module.name == "test_module_optimized"