fischer-agentkit/tests/unit/test_evolution_lifecycle.py

521 lines
17 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Tests for EvolutionMixin - 进化引擎与 Agent 生命周期集成"""
import pytest
from agentkit.core.protocol import TaskMessage, TaskResult, TaskStatus
from agentkit.evolution.ab_tester import ABTestConfig, ABTestResult, ABTester
from agentkit.evolution.evolution_store import InMemoryEvolutionStore
from agentkit.evolution.lifecycle import EvolutionLogEntry, EvolutionMixin
from agentkit.evolution.prompt_optimizer import Module, PromptOptimizer, Signature
from agentkit.evolution.reflector import Reflection, Reflector
from agentkit.evolution.strategy_tuner import StrategyConfig, StrategyTuner
from datetime import datetime, timezone
def _make_task(task_id: str = "test-001") -> TaskMessage:
    """Build a minimal ``echo`` task addressed to ``evolving_agent``.

    Args:
        task_id: Identifier stored on the returned message.

    Returns:
        A ``TaskMessage`` with a fixed payload and a UTC ``created_at``.
    """
    fields = {
        "task_id": task_id,
        "agent_name": "evolving_agent",
        "task_type": "echo",
        "priority": 0,
        "input_data": {"query": "hello"},
        "callback_url": None,
        "created_at": datetime.now(timezone.utc),
    }
    return TaskMessage(**fields)
def _make_result(status: str = TaskStatus.COMPLETED) -> TaskResult:
    """Build a ``TaskResult`` for task ``test-001`` run by ``evolving_agent``.

    Args:
        status: Status stored on the result; defaults to ``TaskStatus.COMPLETED``.

    Returns:
        A ``TaskResult`` with a fixed output payload, no error message,
        UTC start/completion timestamps and a 5.0 second elapsed metric.
    """
    # The two timestamps are taken separately, exactly as two clock reads.
    fields = {
        "task_id": "test-001",
        "agent_name": "evolving_agent",
        "status": status,
        "output_data": {"key": "value"},
        "error_message": None,
        "started_at": datetime.now(timezone.utc),
        "completed_at": datetime.now(timezone.utc),
        "metrics": {"elapsed_seconds": 5.0},
    }
    return TaskResult(**fields)
def _make_module() -> Module:
    """Build the ``test_module`` prompt module used as the starting point.

    Returns:
        A ``Module`` named ``test_module`` whose signature maps a ``query``
        input field to a ``result`` output field.
    """
    signature = Signature(
        input_fields={"query": "search query"},
        output_fields={"result": "search result"},
        instruction="Find the best result.",
    )
    return Module(name="test_module", signature=signature)
# ── EvolutionMixin 与 Agent on_task_complete 集成 ──────────────
class EvolvingAgent(EvolutionMixin):
    """Stand-in Agent that mixes in ``EvolutionMixin``."""

    def __init__(
        self,
        reflector=None,
        prompt_optimizer=None,
        ab_tester=None,
        evolution_store=None,
        strategy_tuner=None,
        strategy_tuning_enabled=False,
    ):
        """Forward every evolution component to ``EvolutionMixin.__init__``."""
        mixin_kwargs = {
            "reflector": reflector,
            "prompt_optimizer": prompt_optimizer,
            "ab_tester": ab_tester,
            "evolution_store": evolution_store,
            "strategy_tuner": strategy_tuner,
            "strategy_tuning_enabled": strategy_tuning_enabled,
        }
        super().__init__(**mixin_kwargs)
        self.name = "evolving_agent"
        # Flipped to True once on_task_complete has finished evolving.
        self.evolve_called = False

    async def on_task_complete(self, task: TaskMessage, output: dict) -> None:
        """Trigger evolution once a task has completed.

        ``output`` is accepted but not used; a freshly built result from
        ``_make_result`` is what gets passed to ``evolve_after_task``.
        """
        await self.evolve_after_task(task, _make_result())
        self.evolve_called = True
@pytest.mark.asyncio
async def test_mixin_integrates_with_on_task_complete():
    """EvolutionMixin integrates with the Agent's on_task_complete hook."""
    agent = EvolvingAgent(reflector=Reflector())
    agent.set_current_module(_make_module())

    await agent.on_task_complete(_make_task(), {"key": "value"})

    assert agent.evolve_called is True
    # Exactly one history record, tied to the default task id.
    records = agent.get_evolution_history()
    assert len(records) == 1
    assert records[0]["task_id"] == "test-001"
# ── Reflector 生成反思 ──────────────────────────────────────
@pytest.mark.asyncio
async def test_reflector_generates_reflection_after_task():
    """The Reflector produces a reflection after a task completes."""
    mixin = EvolutionMixin(reflector=Reflector())
    mixin.set_current_module(_make_module())

    log_entry = await mixin.evolve_after_task(_make_task(), _make_result())

    reflection = log_entry.reflection
    assert reflection is not None
    assert log_entry.reflection.outcome == "success"
    assert log_entry.reflection.quality_score > 0
# ── Prompt 优化在有改进建议时触发 ──────────────────────────────
class LowQualityReflector(Reflector):
    """Reflector that always reports a low-quality failure with a suggestion."""

    async def reflect(self, task, result):
        """Return a fixed failing ``Reflection`` for the given task/result.

        Only ``task.task_id`` and ``result.agent_name`` are read from the
        arguments; every other field is a constant.
        """
        canned = {
            "outcome": "failure",
            "quality_score": 0.2,
            "patterns": ["slow_execution"],
            "insights": ["Low quality score indicates potential issues"],
            "suggestions": ["Consider prompt optimization for this task type"],
        }
        return Reflection(
            task_id=task.task_id,
            agent_name=result.agent_name,
            **canned,
        )
@pytest.mark.asyncio
async def test_prompt_optimization_triggered_when_reflection_suggests_improvement():
    """Prompt optimization runs when the reflection suggests improvements."""
    failing_reflector = LowQualityReflector()
    optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
    # Pre-fill enough successful examples for optimization to trigger.
    for idx in range(3):
        optimizer.add_example(
            input_data={"query": f"q_{idx}"},
            output_data={"result": f"r_{idx}"},
            quality_score=0.9,
        )

    mixin = EvolutionMixin(reflector=failing_reflector, prompt_optimizer=optimizer)
    mixin.set_current_module(_make_module())

    log_entry = await mixin.evolve_after_task(_make_task(), _make_result())

    assert log_entry.reflection is not None
    assert len(log_entry.reflection.suggestions) > 0
    assert log_entry.optimized_module is not None
    assert log_entry.optimized_module.name == "test_module_optimized"
@pytest.mark.asyncio
async def test_no_optimization_when_no_suggestions():
    """No optimization is triggered when the reflection has no suggestions."""
    # The default Reflector is expected to give no suggestions on success.
    mixin = EvolutionMixin(
        reflector=Reflector(),
        prompt_optimizer=PromptOptimizer(),
    )
    mixin.set_current_module(_make_module())

    log_entry = await mixin.evolve_after_task(_make_task(), _make_result())

    assert log_entry.reflection is not None
    assert log_entry.optimized_module is None
# ── AB 测试验证 ──────────────────────────────────────────────
class SucceedingABTester(ABTester):
    """AB tester whose evaluation always declares the experiment arm the winner."""

    async def evaluate(self, test_id: str) -> ABTestResult | None:
        """Return a fixed significant result in favour of ``experiment``."""
        verdict = {
            "control_metric": 0.5,
            "experiment_metric": 0.8,
            "control_samples": 10,
            "experiment_samples": 10,
            "is_significant": True,
            "winner": "experiment",
            "p_value": 0.01,
        }
        return ABTestResult(test_id=test_id, **verdict)
class FailingABTester(ABTester):
    """AB tester whose evaluation always declares the control arm the winner."""

    async def evaluate(self, test_id: str) -> ABTestResult | None:
        """Return a fixed significant result in favour of ``control``."""
        verdict = {
            "control_metric": 0.8,
            "experiment_metric": 0.5,
            "control_samples": 10,
            "experiment_samples": 10,
            "is_significant": True,
            "winner": "control",
            "p_value": 0.01,
        }
        return ABTestResult(test_id=test_id, **verdict)
class InconclusiveABTester(ABTester):
    """AB tester whose evaluation always reports a non-significant result."""

    async def evaluate(self, test_id: str) -> ABTestResult | None:
        """Return a fixed non-significant result with no winner."""
        verdict = {
            "control_metric": 0.5,
            "experiment_metric": 0.52,
            "control_samples": 10,
            "experiment_samples": 10,
            "is_significant": False,
            "winner": None,
            "p_value": 0.8,
        }
        return ABTestResult(test_id=test_id, **verdict)
@pytest.mark.asyncio
async def test_ab_test_significant_treatment_wins():
    """The change is applied when the A/B test is significant and the experiment wins."""
    failing_reflector = LowQualityReflector()
    optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
    for idx in range(3):
        optimizer.add_example(
            input_data={"query": f"q_{idx}"},
            output_data={"result": f"r_{idx}"},
            quality_score=0.9,
        )

    mixin = EvolutionMixin(
        reflector=failing_reflector,
        prompt_optimizer=optimizer,
        ab_tester=SucceedingABTester(),
    )
    mixin.set_current_module(_make_module())

    log_entry = await mixin.evolve_after_task(_make_task(), _make_result())

    assert log_entry.ab_test_result is not None
    assert log_entry.ab_test_result.is_significant is True
    assert log_entry.ab_test_result.winner == "experiment"
    assert log_entry.applied is True
    assert log_entry.rolled_back is False
@pytest.mark.asyncio
async def test_ab_test_significant_control_wins():
    """The change is rolled back when the A/B test is significant and control wins."""
    failing_reflector = LowQualityReflector()
    optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
    for idx in range(3):
        optimizer.add_example(
            input_data={"query": f"q_{idx}"},
            output_data={"result": f"r_{idx}"},
            quality_score=0.9,
        )

    mixin = EvolutionMixin(
        reflector=failing_reflector,
        prompt_optimizer=optimizer,
        ab_tester=FailingABTester(),
    )
    mixin.set_current_module(_make_module())

    log_entry = await mixin.evolve_after_task(_make_task(), _make_result())

    assert log_entry.ab_test_result is not None
    assert log_entry.ab_test_result.is_significant is True
    assert log_entry.ab_test_result.winner == "control"
    assert log_entry.rolled_back is True
    assert log_entry.applied is False
    # The module must not have been replaced.
    assert mixin._current_module.name == "test_module"
@pytest.mark.asyncio
async def test_ab_test_inconclusive_keeps_current():
    """The current prompt is kept when the A/B test is not significant."""
    failing_reflector = LowQualityReflector()
    optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
    for idx in range(3):
        optimizer.add_example(
            input_data={"query": f"q_{idx}"},
            output_data={"result": f"r_{idx}"},
            quality_score=0.9,
        )

    mixin = EvolutionMixin(
        reflector=failing_reflector,
        prompt_optimizer=optimizer,
        ab_tester=InconclusiveABTester(),
    )
    mixin.set_current_module(_make_module())

    log_entry = await mixin.evolve_after_task(_make_task(), _make_result())

    assert log_entry.ab_test_result is not None
    assert log_entry.ab_test_result.is_significant is False
    # Neither applied nor rolled back: nothing changes.
    assert log_entry.applied is False
    assert log_entry.rolled_back is False
    # Module stays the same.
    assert mixin._current_module.name == "test_module"
# ── 进化历史记录 ──────────────────────────────────────────────
@pytest.mark.asyncio
async def test_evolution_history_is_recorded():
    """The evolution history is recorded correctly."""
    mixin = EvolutionMixin(reflector=Reflector())
    mixin.set_current_module(_make_module())

    await mixin.evolve_after_task(_make_task(), _make_result())

    records = mixin.get_evolution_history()
    assert len(records) == 1
    first = records[0]
    assert first["task_id"] == "test-001"
    assert "reflection" in first
    assert first["reflection"]["outcome"] == "success"
@pytest.mark.asyncio
async def test_evolution_history_multiple_entries():
    """Several evolution runs produce several history records, in order."""
    mixin = EvolutionMixin(reflector=Reflector())
    mixin.set_current_module(_make_module())

    for idx in range(3):
        # Task and result share the same zero-padded id.
        run_id = f"test-{idx:03d}"
        task = TaskMessage(
            task_id=run_id,
            agent_name="evolving_agent",
            task_type="echo",
            priority=0,
            input_data={"query": f"hello_{idx}"},
            callback_url=None,
            created_at=datetime.now(timezone.utc),
        )
        result = TaskResult(
            task_id=run_id,
            agent_name="evolving_agent",
            status=TaskStatus.COMPLETED,
            output_data={"key": "value"},
            error_message=None,
            started_at=datetime.now(timezone.utc),
            completed_at=datetime.now(timezone.utc),
            metrics={"elapsed_seconds": 5.0},
        )
        await mixin.evolve_after_task(task, result)

    records = mixin.get_evolution_history()
    assert len(records) == 3
    assert records[0]["task_id"] == "test-000"
    assert records[1]["task_id"] == "test-001"
    assert records[2]["task_id"] == "test-002"
# ── 无组件配置时的优雅降级 ──────────────────────────────────────
@pytest.mark.asyncio
async def test_no_reflector_skips_evolution():
    """Evolution is skipped when no Reflector is configured."""
    bare_mixin = EvolutionMixin()
    bare_mixin.set_current_module(_make_module())

    log_entry = await bare_mixin.evolve_after_task(_make_task(), _make_result())

    assert log_entry.reflection is None
    assert log_entry.applied is False
@pytest.mark.asyncio
async def test_no_evolution_store_applies_directly():
    """Without an EvolutionStore the change is applied directly in memory."""
    failing_reflector = LowQualityReflector()
    optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
    for idx in range(3):
        optimizer.add_example(
            input_data={"query": f"q_{idx}"},
            output_data={"result": f"r_{idx}"},
            quality_score=0.9,
        )

    mixin = EvolutionMixin(reflector=failing_reflector, prompt_optimizer=optimizer)
    mixin.set_current_module(_make_module())

    log_entry = await mixin.evolve_after_task(_make_task(), _make_result())

    # No AB tester and no store are configured: the change is applied directly.
    assert log_entry.applied is True
    assert mixin._current_module.name == "test_module_optimized"
# ── Strategy Tuning 集成 ──────────────────────────────────────
@pytest.mark.asyncio
async def test_strategy_tuning_called_when_enabled():
    """The strategy tuner is invoked during evolution when tuning is enabled."""
    reflector = LowQualityReflector()
    optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
    for i in range(3):
        optimizer.add_example(
            input_data={"query": f"q_{i}"},
            output_data={"result": f"r_{i}"},
            quality_score=0.9,
        )

    tuner = StrategyTuner()
    # Pre-fill tuner history so suggest() doesn't return current
    for i in range(5):
        tuner.record(StrategyConfig(temperature=0.5, max_iterations=5), 0.3 + i * 0.1)

    mixin = EvolutionMixin(
        reflector=reflector,
        prompt_optimizer=optimizer,
        strategy_tuner=tuner,
        strategy_tuning_enabled=True,
    )
    mixin.set_current_module(_make_module())

    # Only the side effect on the tuner matters here, so the returned log
    # entry is intentionally not bound to a name.
    await mixin.evolve_after_task(_make_task(), _make_result())

    # Strategy tuner should have been called and recorded the result
    assert len(tuner._history) >= 6  # 5 pre-filled + 1 from evolution
@pytest.mark.asyncio
async def test_strategy_tuning_not_called_when_disabled():
    """The strategy tuner is not invoked when tuning is disabled."""
    reflector = LowQualityReflector()
    optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
    for i in range(3):
        optimizer.add_example(
            input_data={"query": f"q_{i}"},
            output_data={"result": f"r_{i}"},
            quality_score=0.9,
        )

    tuner = StrategyTuner()
    mixin = EvolutionMixin(
        reflector=reflector,
        prompt_optimizer=optimizer,
        strategy_tuner=tuner,
        strategy_tuning_enabled=False,  # Disabled
    )
    mixin.set_current_module(_make_module())

    # Only the (absence of a) side effect on the tuner matters here, so the
    # returned log entry is intentionally not bound to a name.
    await mixin.evolve_after_task(_make_task(), _make_result())

    # Strategy tuner should NOT have been called
    assert len(tuner._history) == 0
# ── End-to-end: reflect → optimize → A/B test → apply/rollback ──────────
@pytest.mark.asyncio
async def test_end_to_end_evolution_with_ab_test():
    """End-to-end: reflect -> optimize -> A/B test -> apply."""
    failing_reflector = LowQualityReflector()
    optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
    for idx in range(3):
        optimizer.add_example(
            input_data={"query": f"q_{idx}"},
            output_data={"result": f"r_{idx}"},
            quality_score=0.9,
        )

    # The same in-memory store is given to the AB tester and the mixin.
    shared_store = InMemoryEvolutionStore()
    winning_tester = SucceedingABTester(evolution_store=shared_store, min_samples=10)
    mixin = EvolutionMixin(
        reflector=failing_reflector,
        prompt_optimizer=optimizer,
        ab_tester=winning_tester,
        evolution_store=shared_store,
    )
    mixin.set_current_module(_make_module())

    log_entry = await mixin.evolve_after_task(_make_task(), _make_result())

    # Full pipeline: reflected -> optimized -> A/B tested -> applied
    assert log_entry.reflection is not None
    assert log_entry.optimized_module is not None
    assert log_entry.ab_test_result is not None
    assert log_entry.applied is True
    assert mixin._current_module.name == "test_module_optimized"