# fischer-agentkit/tests/integration/test_evolution_loop.py

# 383 lines
# 12 KiB
# Python

"""Integration tests for the complete evolution loop: reflect → optimize → A/B test → apply/rollback"""
import pytest
from datetime import datetime, timezone
from unittest.mock import AsyncMock
from agentkit.core.protocol import EvolutionEvent, TaskMessage, TaskResult, TaskStatus
from agentkit.evolution.ab_tester import ABTestConfig, ABTestResult, ABTester
from agentkit.evolution.evolution_store import EvolutionStore
from agentkit.evolution.lifecycle import EvolutionMixin
from agentkit.evolution.prompt_optimizer import Module, PromptOptimizer, Signature
from agentkit.evolution.reflector import Reflection, Reflector
# ── In-Memory EvolutionStore ───────────────────────────────
class InMemoryEvolutionStore:
    """Dictionary-backed stand-in for EvolutionStore so tests need no PostgreSQL.

    Each recorded event is kept as a plain dict keyed by a sequential
    ``evt-NNNN`` identifier generated by this class.
    """

    def __init__(self):
        # event id -> stored event record
        self._events: dict[str, dict] = {}
        # number of events recorded so far; drives id generation
        self._counter = 0

    async def record(self, event: EvolutionEvent) -> str:
        """Store ``event`` as a new ``"active"`` record and return its id.

        The generated id is also written back onto ``event.event_id``.
        """
        self._counter += 1
        new_id = f"evt-{self._counter:04d}"
        event.event_id = new_id
        snapshot = {
            "id": new_id,
            "agent_name": event.agent_name,
            "change_type": event.change_type,
            "before": event.before,
            "after": event.after,
            "metrics": event.metrics,
            "status": "active",
            "created_at": datetime.now(timezone.utc).isoformat(),
        }
        self._events[new_id] = snapshot
        return new_id

    async def rollback(self, event_id: str) -> bool:
        """Mark the event as ``"rolled_back"``; return False for an unknown id."""
        stored = self._events.get(event_id)
        if stored is None:
            return False
        stored["status"] = "rolled_back"
        return True

    async def list_events(
        self,
        agent_name: str | None = None,
        change_type: str | None = None,
        status: str | None = None,
    ) -> list[dict]:
        """Return stored records matching every filter that was supplied.

        A filter that is ``None`` (or otherwise falsy) is ignored. Records
        come back in insertion order and are the stored dicts themselves,
        not copies.
        """
        criteria = {
            "agent_name": agent_name,
            "change_type": change_type,
            "status": status,
        }
        return [
            stored
            for stored in self._events.values()
            if all(
                not wanted or stored[field] == wanted
                for field, wanted in criteria.items()
            )
        ]
# ── Helpers ────────────────────────────────────────────────
def _make_task(task_id: str = "task-001", **input_overrides) -> TaskMessage:
    """Build a TaskMessage addressed to the ``evolving_agent`` test agent.

    Args:
        task_id: Identifier to stamp on the message.
        **input_overrides: Extra entries merged over the default
            ``{"query": "test"}`` input payload.

    Returns:
        A TaskMessage with priority 1, no callback URL and a UTC creation time.
    """
    payload = {"query": "test"}
    payload.update(input_overrides)
    return TaskMessage(
        task_id=task_id,
        agent_name="evolving_agent",
        task_type="evolution_test",
        priority=1,
        input_data=payload,
        callback_url=None,
        created_at=datetime.now(timezone.utc),
    )
def _make_result(
    task_id: str = "task-001",
    status: str = TaskStatus.COMPLETED,
    output_data: dict | None = None,
) -> TaskResult:
    """Build a TaskResult from the ``evolving_agent`` test agent.

    Args:
        task_id: Identifier of the task this result belongs to.
        status: Result status; defaults to ``TaskStatus.COMPLETED``.
        output_data: Output payload; ``{"result": "ok"}`` is substituted when
            this is ``None`` or empty.

    Returns:
        A TaskResult with no error message, identical start and completion
        timestamps (UTC) and a fixed ``elapsed_seconds`` metric of 5.0.
    """
    timestamp = datetime.now(timezone.utc)
    payload = output_data if output_data else {"result": "ok"}
    return TaskResult(
        task_id=task_id,
        agent_name="evolving_agent",
        status=status,
        output_data=payload,
        error_message=None,
        started_at=timestamp,
        completed_at=timestamp,
        metrics={"elapsed_seconds": 5.0},
    )
def _default_module() -> Module:
    """Return the baseline ``test_module`` prompt module used by the tests."""
    signature = Signature(
        input_fields={"query": "user query"},
        output_fields={"result": "response"},
        instruction="Process the query and return a result",
    )
    return Module(
        name="test_module",
        signature=signature,
        template="Query: {query}",
    )
# ── Tests ──────────────────────────────────────────────────
@pytest.mark.integration
async def test_reflector_generates_reflection():
    """Reflecting on five completed tasks yields five well-formed reflections."""
    reflector = Reflector()
    task_ids = [f"task-{i:03d}" for i in range(5)]

    # The awaits run one at a time, in task order, on the same Reflector.
    reflections = [
        await reflector.reflect(
            _make_task(task_id=current_id),
            _make_result(task_id=current_id),
        )
        for current_id in task_ids
    ]

    # One reflection per task, each a successful outcome with a score in [0, 1].
    assert len(reflections) == 5
    for reflection in reflections:
        assert isinstance(reflection, Reflection)
        assert reflection.outcome == "success"
        assert 0.0 <= reflection.quality_score <= 1.0

    # The final reflection belongs to the final task that was submitted.
    *_, final_reflection = reflections
    assert final_reflection.task_id == "task-004"
@pytest.mark.integration
async def test_prompt_optimizer_generates_few_shot():
    """PromptOptimizer turns stored high-scoring examples into few-shot demos."""
    optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=3)

    # Four high-scoring examples followed by one low-scoring example.
    high_scoring = [
        ({"query": f"question {i}"}, {"result": f"answer {i}"}, 0.8 + i * 0.05)
        for i in range(4)
    ]
    low_scoring = ({"query": "bad question"}, {"result": "error"}, 0.2)
    for inputs, outputs, score in [*high_scoring, low_scoring]:
        optimizer.add_example(
            input_data=inputs,
            output_data=outputs,
            quality_score=score,
        )

    # The optimizer is expected to classify them as 4 successes and 1 failure.
    success_count, failure_count = optimizer.example_count
    assert (success_count, failure_count) == (4, 1)

    baseline = _default_module()
    optimized = await optimizer.optimize(baseline)

    # A renamed module, demos capped at max_demos=3, and a changed instruction.
    assert optimized.name == "test_module_optimized"
    assert len(optimized.demos) == 3
    assert optimized.signature.instruction != baseline.signature.instruction
@pytest.mark.integration
async def test_ab_tester_auto_apply_on_improvement():
    """ABTester picks the experiment group when it clearly outperforms control.

    Feeds ``min_samples`` noisy scores per group (control centred on 0.5,
    experiment centred on 0.8) and checks that evaluation names the
    experiment as winner with the higher metric.
    """
    import random

    test_id = "test-improve-001"
    ab_tester = ABTester()
    config = ABTestConfig(
        test_id=test_id,
        agent_name="evolving_agent",
        change_type="prompt",
        min_samples=30,
    )
    ab_tester.create_test(config)

    # A private generator keeps the samples reproducible without reseeding
    # the process-wide ``random`` state that other tests may depend on.
    # Random(42) yields the same gauss() sequence as random.seed(42).
    rng = random.Random(42)
    for _ in range(config.min_samples):
        # Draw order (control first, then experiment) fixes the sample values.
        control_val = 0.5 + rng.gauss(0, 0.05)
        experiment_val = 0.8 + rng.gauss(0, 0.05)
        ab_tester.record_result(test_id, "control", control_val)
        ab_tester.record_result(test_id, "experiment", experiment_val)

    result = await ab_tester.evaluate(test_id)
    assert result is not None
    assert result.winner == "experiment"
    assert result.experiment_metric > result.control_metric
@pytest.mark.integration
async def test_ab_tester_auto_rollback_on_degradation():
    """ABTester picks the control group when the experiment clearly degrades.

    Feeds ``min_samples`` noisy scores per group (control centred on 0.8,
    experiment centred on 0.3) and checks that evaluation names control as
    winner with the higher metric.
    """
    import random

    test_id = "test-degrade-001"
    ab_tester = ABTester()
    config = ABTestConfig(
        test_id=test_id,
        agent_name="evolving_agent",
        change_type="prompt",
        min_samples=30,
    )
    ab_tester.create_test(config)

    # A private generator keeps the samples reproducible without reseeding
    # the process-wide ``random`` state that other tests may depend on.
    # Random(42) yields the same gauss() sequence as random.seed(42).
    rng = random.Random(42)
    for _ in range(config.min_samples):
        # Draw order (control first, then experiment) fixes the sample values.
        control_val = 0.8 + rng.gauss(0, 0.05)
        experiment_val = 0.3 + rng.gauss(0, 0.05)
        ab_tester.record_result(test_id, "control", control_val)
        ab_tester.record_result(test_id, "experiment", experiment_val)

    result = await ab_tester.evaluate(test_id)
    assert result is not None
    assert result.winner == "control"
    assert result.experiment_metric < result.control_metric
@pytest.mark.integration
async def test_evolution_store_records_and_queries():
    """The in-memory store records events, filters them and tracks rollbacks."""
    store = InMemoryEvolutionStore()

    # Two events for agent_a (prompt + strategy) and one prompt event for agent_b.
    events = [
        EvolutionEvent(
            agent_name="agent_a",
            change_type="prompt",
            before={"module": "v1"},
            after={"module": "v2"},
            metrics={"quality_score": 0.7},
        ),
        EvolutionEvent(
            agent_name="agent_a",
            change_type="strategy",
            before={"strategy": "default"},
            after={"strategy": "optimized"},
            metrics={"quality_score": 0.8},
        ),
        EvolutionEvent(
            agent_name="agent_b",
            change_type="prompt",
            before={"module": "v1"},
            after={"module": "v3"},
            metrics={"quality_score": 0.6},
        ),
    ]
    recorded_ids = [await store.record(event) for event in events]
    for recorded_id in recorded_ids:
        assert recorded_id is not None
    first_id = recorded_ids[0]

    # Filtering by agent and by change type each match two of the three events.
    by_agent = await store.list_events(agent_name="agent_a")
    assert len(by_agent) == 2
    by_type = await store.list_events(change_type="prompt")
    assert len(by_type) == 2

    # Rolling back agent_a's prompt event leaves one active event for agent_a
    # and exactly one rolled-back event overall.
    assert await store.rollback(first_id) is True
    still_active = await store.list_events(agent_name="agent_a", status="active")
    assert len(still_active) == 1
    reverted = await store.list_events(status="rolled_back")
    assert len(reverted) == 1
@pytest.mark.integration
async def test_full_evolution_loop_apply():
    """EvolutionMixin.evolve_after_task runs end to end and logs the task."""
    optimizer = PromptOptimizer(max_demos=2, min_examples_for_optimization=2)
    mixin = EvolutionMixin(
        reflector=Reflector(),
        prompt_optimizer=optimizer,
        ab_tester=ABTester(),
        evolution_store=InMemoryEvolutionStore(),
    )
    mixin.set_current_module(_default_module())

    # Seed three high-scoring examples, one more than the configured
    # min_examples_for_optimization=2.
    for idx in range(3):
        optimizer.add_example(
            input_data={"query": f"q{idx}"},
            output_data={"result": f"a{idx}"},
            quality_score=0.85,
        )

    task_id = "evolve-task-001"
    log_entry = await mixin.evolve_after_task(
        _make_task(task_id=task_id),
        _make_result(task_id=task_id),
    )

    # A log entry for this task is returned ...
    assert log_entry is not None
    assert log_entry.task_id == "evolve-task-001"

    # ... and it is the first entry in the mixin's evolution history.
    history = mixin.get_evolution_history()
    assert len(history) >= 1
    assert history[0]["task_id"] == "evolve-task-001"
@pytest.mark.integration
async def test_full_evolution_loop_rollback():
    """Run the evolution loop, then roll a recorded change back via the store."""
    # A stock Reflector is used here; nothing forces low-quality suggestions.
    optimizer = PromptOptimizer(max_demos=2, min_examples_for_optimization=2)
    store = InMemoryEvolutionStore()
    mixin = EvolutionMixin(
        reflector=Reflector(),
        prompt_optimizer=optimizer,
        ab_tester=ABTester(),
        evolution_store=store,
    )
    mixin.set_current_module(_default_module())

    # Seed three high-scoring examples, one more than the configured
    # min_examples_for_optimization=2.
    for idx in range(3):
        optimizer.add_example(
            input_data={"query": f"q{idx}"},
            output_data={"result": f"a{idx}"},
            quality_score=0.85,
        )

    task_id = "evolve-rollback-001"
    log_entry = await mixin.evolve_after_task(
        _make_task(task_id=task_id),
        _make_result(task_id=task_id),
    )
    assert log_entry is not None

    # NOTE(review): per the original author's note, EvolutionMixin's internal
    # A/B step scores the experiment above control, so the loop itself is not
    # expected to roll back - confirm against the lifecycle module. Rollback
    # is therefore exercised directly against the store below.
    degraded_change = EvolutionEvent(
        agent_name="evolving_agent",
        change_type="prompt",
        before={"module": "v1"},
        after={"module": "v2_bad"},
        metrics={"quality_score": 0.3},
    )
    event_id = await store.record(degraded_change)
    assert await store.rollback(event_id) is True

    # The event must now show up under the "rolled_back" status filter.
    rolled_events = await store.list_events(status="rolled_back")
    assert event_id in {entry["id"] for entry in rolled_events}