383 lines
12 KiB
Python
383 lines
12 KiB
Python
"""Integration tests for the complete evolution loop: reflect → optimize → A/B test → apply/rollback"""
|
|
|
|
import pytest
|
|
from datetime import datetime, timezone
|
|
from unittest.mock import AsyncMock
|
|
|
|
from agentkit.core.protocol import EvolutionEvent, TaskMessage, TaskResult, TaskStatus
|
|
from agentkit.evolution.ab_tester import ABTestConfig, ABTestResult, ABTester
|
|
from agentkit.evolution.evolution_store import EvolutionStore
|
|
from agentkit.evolution.lifecycle import EvolutionMixin
|
|
from agentkit.evolution.prompt_optimizer import Module, PromptOptimizer, Signature
|
|
from agentkit.evolution.reflector import Reflection, Reflector
|
|
|
|
|
|
# ── In-Memory EvolutionStore ───────────────────────────────
|
|
|
|
|
|
class InMemoryEvolutionStore:
    """Dictionary-backed stand-in for EvolutionStore, so tests need no PostgreSQL."""

    def __init__(self):
        # Monotonic counter used to mint sequential event ids.
        self._counter = 0
        # Stored records, keyed by generated event id (insertion-ordered).
        self._events: dict[str, dict] = {}

    async def record(self, event: EvolutionEvent) -> str:
        """Store a snapshot of ``event`` and return the id generated for it.

        Ids are sequential (``evt-0001``, ``evt-0002``, ...). The id is also
        written back onto ``event.event_id``. Every new record starts with
        status ``"active"`` and a UTC ISO-8601 ``created_at`` timestamp.
        """
        self._counter += 1
        new_id = f"evt-{self._counter:04d}"
        event.event_id = new_id
        snapshot = dict(
            id=new_id,
            agent_name=event.agent_name,
            change_type=event.change_type,
            before=event.before,
            after=event.after,
            metrics=event.metrics,
            status="active",
            created_at=datetime.now(timezone.utc).isoformat(),
        )
        self._events[new_id] = snapshot
        return new_id

    async def rollback(self, event_id: str) -> bool:
        """Mark the record as ``"rolled_back"``; return False for an unknown id."""
        stored = self._events.get(event_id)
        if stored is None:
            return False
        stored["status"] = "rolled_back"
        return True

    async def list_events(
        self,
        agent_name: str | None = None,
        change_type: str | None = None,
        status: str | None = None,
    ) -> list[dict]:
        """Return stored records, in insertion order, matching every given filter.

        A filter that is ``None`` (or otherwise falsy) is not applied. The
        returned list holds the stored record dicts themselves, not copies.
        """
        requested = {
            "agent_name": agent_name,
            "change_type": change_type,
            "status": status,
        }
        # Keep only the filters the caller actually supplied.
        criteria = [(field, wanted) for field, wanted in requested.items() if wanted]
        return [
            stored
            for stored in self._events.values()
            if all(stored[field] == wanted for field, wanted in criteria)
        ]
|
|
|
|
|
|
# ── Helpers ────────────────────────────────────────────────
|
|
|
|
|
|
def _make_task(task_id: str = "task-001", **input_overrides) -> TaskMessage:
    """Build a TaskMessage addressed to "evolving_agent".

    Extra keyword arguments are merged into ``input_data`` on top of the
    default ``{"query": "test"}`` payload, so they can add or replace keys.
    """
    payload = {"query": "test"}
    payload.update(input_overrides)
    fields = dict(
        task_id=task_id,
        agent_name="evolving_agent",
        task_type="evolution_test",
        priority=1,
        input_data=payload,
        callback_url=None,
        created_at=datetime.now(timezone.utc),
    )
    return TaskMessage(**fields)
|
|
|
|
|
|
def _make_result(
    task_id: str = "task-001",
    status: str = TaskStatus.COMPLETED,
    output_data: dict | None = None,
) -> TaskResult:
    """Build a TaskResult for "evolving_agent" that starts and completes "now".

    Args:
        task_id: Identifier copied onto the result.
        status: Status stored on the result; defaults to TaskStatus.COMPLETED.
        output_data: Payload for the result. Only ``None`` selects the default
            ``{"result": "ok"}``; an explicitly passed empty dict is kept
            as-is so callers can simulate an empty output.

    Returns:
        A TaskResult with no error message and a fixed
        ``{"elapsed_seconds": 5.0}`` metrics dict.
    """
    # Compare against None rather than relying on truthiness: `output_data or
    # default` would silently replace a deliberately empty payload.
    if output_data is None:
        output_data = {"result": "ok"}

    # One timestamp is shared by started_at and completed_at.
    now = datetime.now(timezone.utc)
    return TaskResult(
        task_id=task_id,
        agent_name="evolving_agent",
        status=status,
        output_data=output_data,
        error_message=None,
        started_at=now,
        completed_at=now,
        metrics={"elapsed_seconds": 5.0},
    )
|
|
|
|
|
|
def _default_module() -> Module:
    """Return the baseline "test_module" Module that the optimizer tests start from."""
    base_signature = Signature(
        input_fields={"query": "user query"},
        output_fields={"result": "response"},
        instruction="Process the query and return a result",
    )
    return Module(
        name="test_module",
        signature=base_signature,
        template="Query: {query}",
    )
|
|
|
|
|
|
# ── Tests ──────────────────────────────────────────────────
|
|
|
|
|
|
@pytest.mark.integration
async def test_reflector_generates_reflection():
    """Reflector yields one successful Reflection for each of five completed tasks."""
    reflector = Reflector()
    task_ids = [f"task-{index:03d}" for index in range(5)]

    # Reflect on each task/result pair in order, one at a time.
    reflections = []
    for current_id in task_ids:
        task = _make_task(task_id=current_id)
        result = _make_result(task_id=current_id)
        reflections.append(await reflector.reflect(task, result))

    # One reflection per task, each a well-formed success.
    assert len(reflections) == 5
    for reflection in reflections:
        assert isinstance(reflection, Reflection)
        assert reflection.outcome == "success"
        assert 0.0 <= reflection.quality_score <= 1.0

    # The final reflection belongs to the final task; only its task id is checked.
    assert reflections[-1].task_id == "task-004"
|
|
|
|
|
|
@pytest.mark.integration
async def test_prompt_optimizer_generates_few_shot():
    """PromptOptimizer turns recorded successes into few-shot demos, capped at max_demos."""
    demo_cap = 3
    optimizer = PromptOptimizer(max_demos=demo_cap, min_examples_for_optimization=3)

    # Four high-scoring examples (0.80 .. 0.95) that should count as successes.
    for index in range(4):
        optimizer.add_example(
            input_data={"query": f"question {index}"},
            output_data={"result": f"answer {index}"},
            quality_score=0.8 + index * 0.05,
        )

    # One low-scoring example that should count as a failure.
    optimizer.add_example(
        input_data={"query": "bad question"},
        output_data={"result": "error"},
        quality_score=0.2,
    )

    n_success, n_failure = optimizer.example_count
    assert n_success == 4
    assert n_failure == 1

    baseline = _default_module()
    optimized = await optimizer.optimize(baseline)

    # The optimized module is renamed, carries exactly max_demos demos, and
    # has an instruction that differs from the baseline one.
    assert optimized.name == "test_module_optimized"
    assert len(optimized.demos) == 3
    assert optimized.signature.instruction != baseline.signature.instruction
|
|
|
|
|
|
@pytest.mark.integration
async def test_ab_tester_auto_apply_on_improvement():
    """ABTester picks the experiment arm when it clearly outperforms control.

    Both arms receive ``min_samples`` noisy scores (control around 0.5,
    experiment around 0.8, sigma 0.05); evaluation must name "experiment"
    the winner with a higher metric than control.
    """
    import random

    ab_tester = ABTester()

    config = ABTestConfig(
        test_id="test-improve-001",
        agent_name="evolving_agent",
        change_type="prompt",
        min_samples=30,
    )
    ab_tester.create_test(config)

    # Use a private, seeded generator instead of random.seed(): the samples
    # stay reproducible without reseeding the process-wide RNG that other
    # tests (or the code under test) may depend on.
    rng = random.Random(42)
    for _ in range(config.min_samples):
        # Draw control first, then experiment, so the sequence is stable.
        control_val = 0.5 + rng.gauss(0, 0.05)
        experiment_val = 0.8 + rng.gauss(0, 0.05)
        ab_tester.record_result(config.test_id, "control", control_val)
        ab_tester.record_result(config.test_id, "experiment", experiment_val)

    result = await ab_tester.evaluate(config.test_id)

    assert result is not None
    assert result.winner == "experiment"
    assert result.experiment_metric > result.control_metric
|
|
|
|
|
|
@pytest.mark.integration
async def test_ab_tester_auto_rollback_on_degradation():
    """ABTester picks the control arm when the experiment clearly degrades.

    Both arms receive ``min_samples`` noisy scores (control around 0.8,
    experiment around 0.3, sigma 0.05); evaluation must name "control" the
    winner with the experiment metric below the control metric.
    """
    import random

    ab_tester = ABTester()

    config = ABTestConfig(
        test_id="test-degrade-001",
        agent_name="evolving_agent",
        change_type="prompt",
        min_samples=30,
    )
    ab_tester.create_test(config)

    # Use a private, seeded generator instead of random.seed(): the samples
    # stay reproducible without reseeding the process-wide RNG that other
    # tests (or the code under test) may depend on.
    rng = random.Random(42)
    for _ in range(config.min_samples):
        # Draw control first, then experiment, so the sequence is stable.
        control_val = 0.8 + rng.gauss(0, 0.05)
        experiment_val = 0.3 + rng.gauss(0, 0.05)
        ab_tester.record_result(config.test_id, "control", control_val)
        ab_tester.record_result(config.test_id, "experiment", experiment_val)

    result = await ab_tester.evaluate(config.test_id)

    assert result is not None
    assert result.winner == "control"
    assert result.experiment_metric < result.control_metric
|
|
|
|
|
|
@pytest.mark.integration
async def test_evolution_store_records_and_queries():
    """The in-memory store records events, filters them, and tracks rollbacks."""
    store = InMemoryEvolutionStore()

    # (agent_name, change_type, before, after, quality_score) for each event.
    specs = [
        ("agent_a", "prompt", {"module": "v1"}, {"module": "v2"}, 0.7),
        ("agent_a", "strategy", {"strategy": "default"}, {"strategy": "optimized"}, 0.8),
        ("agent_b", "prompt", {"module": "v1"}, {"module": "v3"}, 0.6),
    ]
    events = [
        EvolutionEvent(
            agent_name=agent,
            change_type=kind,
            before=before,
            after=after,
            metrics={"quality_score": score},
        )
        for agent, kind, before, after, score in specs
    ]

    # Record in order and keep the ids the store hands back.
    recorded_ids = []
    for event in events:
        recorded_ids.append(await store.record(event))
    first_id = recorded_ids[0]

    assert all(event_id is not None for event_id in recorded_ids)

    # Filter by agent name: two events belong to agent_a.
    by_agent = await store.list_events(agent_name="agent_a")
    assert len(by_agent) == 2

    # Filter by change type: two events are prompt changes.
    by_type = await store.list_events(change_type="prompt")
    assert len(by_type) == 2

    # Rolling back a known event succeeds.
    assert await store.rollback(first_id) is True

    # After the rollback only one of agent_a's events is still active.
    still_active = await store.list_events(agent_name="agent_a", status="active")
    assert len(still_active) == 1

    # Exactly one event is reported as rolled back.
    reverted = await store.list_events(status="rolled_back")
    assert len(reverted) == 1
|
|
|
|
|
|
@pytest.mark.integration
async def test_full_evolution_loop_apply():
    """One evolve_after_task pass through a fully wired EvolutionMixin.

    Wires reflector, optimizer, A/B tester and store together, runs a single
    evolution step, and checks that a log entry and a history record exist
    for the task.
    """
    optimizer = PromptOptimizer(max_demos=2, min_examples_for_optimization=2)
    mixin = EvolutionMixin(
        reflector=Reflector(),
        prompt_optimizer=optimizer,
        ab_tester=ABTester(),
        evolution_store=InMemoryEvolutionStore(),
    )
    mixin.set_current_module(_default_module())

    # Seed the optimizer with three good examples, above its configured
    # minimum of two.
    for index in range(3):
        optimizer.add_example(
            input_data={"query": f"q{index}"},
            output_data={"result": f"a{index}"},
            quality_score=0.85,
        )

    # Simulate one finished task and run the evolution step on it.
    evolve_id = "evolve-task-001"
    task = _make_task(task_id=evolve_id)
    result = _make_result(task_id=evolve_id)
    log_entry = await mixin.evolve_after_task(task, result)

    # The step produced a log entry for this task.
    assert log_entry is not None
    assert log_entry.task_id == "evolve-task-001"

    # The history exposes at least one record, and the first is for this task.
    history = mixin.get_evolution_history()
    assert len(history) >= 1
    assert history[0]["task_id"] == "evolve-task-001"
|
|
|
|
|
|
@pytest.mark.integration
async def test_full_evolution_loop_rollback():
    """Evolution pass followed by a direct, store-level rollback.

    The evolve_after_task call here is only checked for producing a log
    entry. The rollback itself is exercised directly against the store, not
    through a degraded A/B experiment.
    """
    # Stock components: nothing here is customised to produce poor suggestions.
    optimizer = PromptOptimizer(max_demos=2, min_examples_for_optimization=2)
    store = InMemoryEvolutionStore()
    mixin = EvolutionMixin(
        reflector=Reflector(),
        prompt_optimizer=optimizer,
        ab_tester=ABTester(),
        evolution_store=store,
    )
    mixin.set_current_module(_default_module())

    # Seed the optimizer with three good examples, above its configured
    # minimum of two.
    for index in range(3):
        optimizer.add_example(
            input_data={"query": f"q{index}"},
            output_data={"result": f"a{index}"},
            quality_score=0.85,
        )

    rollback_id = "evolve-rollback-001"
    task = _make_task(task_id=rollback_id)
    result = _make_result(task_id=rollback_id)
    log_entry = await mixin.evolve_after_task(task, result)
    assert log_entry is not None

    # NOTE(review): per the original author's comment, EvolutionMixin's A/B
    # step records experiment_score = quality_score + 0.1, so the experiment
    # is expected to win and be applied. That is not visible from this file.
    # Driving a real rollback would mean manipulating the ABTester directly;
    # instead the rollback path is exercised on the store itself.
    bad_change = EvolutionEvent(
        agent_name="evolving_agent",
        change_type="prompt",
        before={"module": "v1"},
        after={"module": "v2_bad"},
        metrics={"quality_score": 0.3},
    )
    event_id = await store.record(bad_change)
    assert await store.rollback(event_id) is True

    # The recorded event must now show up among the rolled-back ones.
    reverted = await store.list_events(status="rolled_back")
    assert any(entry["id"] == event_id for entry in reverted)
|