fischer-agentkit/tests/unit/test_evolution_lifecycle.py

348 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Tests for EvolutionMixin - 进化引擎与 Agent 生命周期集成"""
import pytest
from agentkit.core.protocol import TaskMessage, TaskResult, TaskStatus
from agentkit.evolution.ab_tester import ABTestConfig, ABTestResult, ABTester
from agentkit.evolution.evolution_store import EvolutionStore
from agentkit.evolution.lifecycle import EvolutionLogEntry, EvolutionMixin
from agentkit.evolution.prompt_optimizer import Module, PromptOptimizer, Signature
from agentkit.evolution.reflector import Reflection, Reflector
from agentkit.evolution.strategy_tuner import StrategyConfig, StrategyTuner
from datetime import datetime, timezone
def _make_task() -> TaskMessage:
    """Build the fixed ``echo`` task (id ``test-001``) shared by these tests."""
    created = datetime.now(timezone.utc)
    return TaskMessage(
        task_id="test-001",
        agent_name="evolving_agent",
        task_type="echo",
        priority=0,
        input_data={"query": "hello"},
        callback_url=None,
        created_at=created,
    )
def _make_result(status: str = TaskStatus.COMPLETED) -> TaskResult:
    """Build a result for task ``test-001`` with a fixed payload and metrics.

    Args:
        status: Status stored on the result; defaults to
            ``TaskStatus.COMPLETED``.

    Returns:
        A ``TaskResult`` with no error message and a 5.0 second elapsed metric.
    """
    # Two separate clock reads, taken in the same order as the fields they fill.
    started = datetime.now(timezone.utc)
    finished = datetime.now(timezone.utc)
    return TaskResult(
        task_id="test-001",
        agent_name="evolving_agent",
        status=status,
        output_data={"key": "value"},
        error_message=None,
        started_at=started,
        completed_at=finished,
        metrics={"elapsed_seconds": 5.0},
    )
def _make_module() -> Module:
    """Build the baseline ``test_module`` prompt module used by these tests."""
    signature = Signature(
        input_fields={"query": "search query"},
        output_fields={"result": "search result"},
        instruction="Find the best result.",
    )
    return Module(name="test_module", signature=signature)
# ── EvolutionMixin 与 Agent on_task_complete 集成 ──────────────
class EvolvingAgent(EvolutionMixin):
    """Test double for an agent that mixes in ``EvolutionMixin``."""

    def __init__(
        self,
        reflector=None,
        prompt_optimizer=None,
        ab_tester=None,
        evolution_store=None,
    ):
        """Forward the evolution components to the mixin and set agent state."""
        components = {
            "reflector": reflector,
            "prompt_optimizer": prompt_optimizer,
            "ab_tester": ab_tester,
            "evolution_store": evolution_store,
        }
        super().__init__(**components)
        self.name = "evolving_agent"
        # Set to True only after on_task_complete has awaited the evolution step.
        self.evolve_called = False

    async def on_task_complete(self, task: TaskMessage, output: dict) -> None:
        """Run the evolution step for ``task`` once it has completed.

        ``output`` is accepted but not used; a canned result from
        ``_make_result()`` is passed to ``evolve_after_task`` instead.
        """
        canned_result = _make_result()
        await self.evolve_after_task(task, canned_result)
        self.evolve_called = True
@pytest.mark.asyncio
async def test_mixin_integrates_with_on_task_complete():
    """EvolutionMixin runs from an agent's ``on_task_complete`` hook."""
    agent = EvolvingAgent(reflector=Reflector())
    agent.set_current_module(_make_module())

    await agent.on_task_complete(_make_task(), {"key": "value"})

    assert agent.evolve_called is True
    records = agent.get_evolution_history()
    assert len(records) == 1
    assert records[0]["task_id"] == "test-001"
# ── Reflector 生成反思 ──────────────────────────────────────
@pytest.mark.asyncio
async def test_reflector_generates_reflection_after_task():
    """With a Reflector configured, the returned entry carries a reflection."""
    mixin = EvolutionMixin(reflector=Reflector())
    mixin.set_current_module(_make_module())

    entry = await mixin.evolve_after_task(_make_task(), _make_result())

    reflection = entry.reflection
    assert reflection is not None
    assert reflection.outcome == "success"
    assert reflection.quality_score > 0
# ── Prompt 优化在有改进建议时触发 ──────────────────────────────
class LowQualityReflector(Reflector):
    """Reflector stub that always reports a low-quality failure with a suggestion."""

    async def reflect(self, task, result):
        """Return a fixed failing reflection for ``task``.

        Only ``task.task_id`` and ``result.agent_name`` are read from the
        arguments; every other field is a constant.
        """
        canned = Reflection(
            task_id=task.task_id,
            agent_name=result.agent_name,
            outcome="failure",
            quality_score=0.2,
            patterns=["slow_execution"],
            insights=["Low quality score indicates potential issues"],
            suggestions=["Consider prompt optimization for this task type"],
        )
        return canned
@pytest.mark.asyncio
async def test_prompt_optimization_triggered_when_reflection_suggests_improvement():
    """A reflection that carries suggestions leads to an optimized module."""
    optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
    # Seed the optimizer with high-scoring examples so optimization can trigger.
    for idx in range(3):
        optimizer.add_example(
            input_data={"query": f"q_{idx}"},
            output_data={"result": f"r_{idx}"},
            quality_score=0.9,
        )

    mixin = EvolutionMixin(
        reflector=LowQualityReflector(),
        prompt_optimizer=optimizer,
    )
    mixin.set_current_module(_make_module())

    entry = await mixin.evolve_after_task(_make_task(), _make_result())

    assert entry.reflection is not None
    assert len(entry.reflection.suggestions) > 0
    assert entry.optimized_module is not None
    assert entry.optimized_module.name == "test_module_optimized"
@pytest.mark.asyncio
async def test_no_optimization_when_no_suggestions():
    """No optimized module is produced when the reflection has no suggestions."""
    # The default Reflector is expected to give no suggestions for a successful task.
    mixin = EvolutionMixin(
        reflector=Reflector(),
        prompt_optimizer=PromptOptimizer(),
    )
    mixin.set_current_module(_make_module())

    entry = await mixin.evolve_after_task(_make_task(), _make_result())

    assert entry.reflection is not None
    assert entry.optimized_module is None
# ── AB 测试验证 ──────────────────────────────────────────────
@pytest.mark.asyncio
async def test_ab_test_validation_before_applying():
    """With an ABTester configured, the entry records an A/B test result."""
    optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
    for idx in range(3):
        optimizer.add_example(
            input_data={"query": f"q_{idx}"},
            output_data={"result": f"r_{idx}"},
            quality_score=0.9,
        )

    mixin = EvolutionMixin(
        reflector=LowQualityReflector(),
        prompt_optimizer=optimizer,
        ab_tester=ABTester(),
    )
    mixin.set_current_module(_make_module())

    entry = await mixin.evolve_after_task(_make_task(), _make_result())

    ab_result = entry.ab_test_result
    assert ab_result is not None
    assert ab_result.test_id.startswith("evolve_")
# ── AB 测试失败时回滚 ──────────────────────────────────────
class FailingABTester(ABTester):
    """A/B tester stub whose evaluation always declares the control group the winner."""

    async def evaluate(self, test_id: str) -> ABTestResult | None:
        """Return a fixed, significant result for ``test_id`` with winner ``"control"``."""
        verdict = ABTestResult(
            test_id=test_id,
            control_metric=0.8,
            experiment_metric=0.5,
            control_samples=30,
            experiment_samples=30,
            is_significant=True,
            winner="control",
            p_value=0.01,
        )
        return verdict
@pytest.mark.asyncio
async def test_rollback_when_ab_test_shows_degradation():
    """A control-wins A/B result rolls the change back and keeps the original module."""
    optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
    for idx in range(3):
        optimizer.add_example(
            input_data={"query": f"q_{idx}"},
            output_data={"result": f"r_{idx}"},
            quality_score=0.9,
        )

    mixin = EvolutionMixin(
        reflector=LowQualityReflector(),
        prompt_optimizer=optimizer,
        ab_tester=FailingABTester(),
    )
    baseline_module = _make_module()
    mixin.set_current_module(baseline_module)

    entry = await mixin.evolve_after_task(_make_task(), _make_result())

    assert entry.rolled_back is True
    assert entry.applied is False
    # The active module must not have been swapped for the optimized one.
    assert mixin._current_module.name == "test_module"
# ── 进化历史记录 ──────────────────────────────────────────────
@pytest.mark.asyncio
async def test_evolution_history_is_recorded():
    """One evolution pass adds one history record that includes its reflection."""
    mixin = EvolutionMixin(reflector=Reflector())
    mixin.set_current_module(_make_module())

    await mixin.evolve_after_task(_make_task(), _make_result())

    history = mixin.get_evolution_history()
    assert len(history) == 1
    record = history[0]
    assert record["task_id"] == "test-001"
    assert "reflection" in record
    assert record["reflection"]["outcome"] == "success"
@pytest.mark.asyncio
async def test_evolution_history_multiple_entries():
    """Several evolution passes add one history record each, in call order."""
    mixin = EvolutionMixin(reflector=Reflector())
    mixin.set_current_module(_make_module())

    for idx in range(3):
        # The task and its result share the same zero-padded id.
        task_id = f"test-{idx:03d}"
        task = TaskMessage(
            task_id=task_id,
            agent_name="evolving_agent",
            task_type="echo",
            priority=0,
            input_data={"query": f"hello_{idx}"},
            callback_url=None,
            created_at=datetime.now(timezone.utc),
        )
        result = TaskResult(
            task_id=task_id,
            agent_name="evolving_agent",
            status=TaskStatus.COMPLETED,
            output_data={"key": "value"},
            error_message=None,
            started_at=datetime.now(timezone.utc),
            completed_at=datetime.now(timezone.utc),
            metrics={"elapsed_seconds": 5.0},
        )
        await mixin.evolve_after_task(task, result)

    history = mixin.get_evolution_history()
    assert len(history) == 3
    recorded_ids = [record["task_id"] for record in history]
    assert recorded_ids == ["test-000", "test-001", "test-002"]
# ── 无组件配置时的优雅降级 ──────────────────────────────────────
@pytest.mark.asyncio
async def test_no_reflector_skips_evolution():
    """Without a Reflector the entry has no reflection and nothing is applied."""
    bare_mixin = EvolutionMixin()
    bare_mixin.set_current_module(_make_module())

    entry = await bare_mixin.evolve_after_task(_make_task(), _make_result())

    assert entry.reflection is None
    assert entry.applied is False
@pytest.mark.asyncio
async def test_no_evolution_store_applies_directly():
    """Without an EvolutionStore the optimized module is applied in memory."""
    optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
    for idx in range(3):
        optimizer.add_example(
            input_data={"query": f"q_{idx}"},
            output_data={"result": f"r_{idx}"},
            quality_score=0.9,
        )

    mixin = EvolutionMixin(
        reflector=LowQualityReflector(),
        prompt_optimizer=optimizer,
    )
    mixin.set_current_module(_make_module())

    entry = await mixin.evolve_after_task(_make_task(), _make_result())

    # With neither an A/B tester nor a store, the change is applied directly.
    assert entry.applied is True
    assert mixin._current_module.name == "test_module_optimized"