# Listing metadata: 132 lines, 3.7 KiB, Python
"""Tests for Evolution system"""
|
|
|
|
import pytest
|
|
|
|
from agentkit.evolution.reflector import Reflector, Reflection
|
|
from agentkit.evolution.prompt_optimizer import PromptOptimizer, Signature, Module
|
|
from agentkit.evolution.strategy_tuner import StrategyTuner, StrategyConfig
|
|
from agentkit.evolution.ab_tester import ABTester, ABTestConfig
|
|
from agentkit.core.protocol import TaskMessage, TaskResult, TaskStatus
|
|
from datetime import datetime, timezone
|
|
|
|
|
|
def _make_task() -> TaskMessage:
    """Build the ``TaskMessage`` fixture used by the reflector tests.

    All fields are fixed placeholder values except ``created_at``, which is
    the timezone-aware UTC time at the moment of the call.
    """
    fields = dict(
        task_id="test-001",
        agent_name="test",
        task_type="echo",
        priority=0,
        input_data={},
        callback_url=None,
        created_at=datetime.now(timezone.utc),
    )
    return TaskMessage(**fields)
|
|
|
|
|
|
def _make_result(status: str = TaskStatus.COMPLETED) -> TaskResult:
    """Build a ``TaskResult`` fixture for task ``test-001``.

    Args:
        status: Value stored in the result's ``status`` field; defaults to
            ``TaskStatus.COMPLETED``.

    Returns:
        A result with a one-entry ``output_data`` payload, no error message,
        UTC start/completion timestamps taken at call time, and an
        ``elapsed_seconds`` metric of 5.0.
    """
    payload = {"key": "value"}
    timing = {"elapsed_seconds": 5.0}
    return TaskResult(
        task_id="test-001",
        agent_name="test",
        status=status,
        output_data=payload,
        error_message=None,
        # Two separate clock reads, so completed_at is taken after started_at.
        started_at=datetime.now(timezone.utc),
        completed_at=datetime.now(timezone.utc),
        metrics=timing,
    )
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_reflector_success():
    """Reflecting on a completed result yields a "success" outcome with a positive score."""
    reflection = await Reflector().reflect(_make_task(), _make_result())

    assert reflection.outcome == "success"
    assert reflection.quality_score > 0
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_reflector_failure():
    """Reflecting on a failed result yields a "failure" outcome with a zero score."""
    reflector = Reflector()
    task = _make_task()

    # Start from a FAILED result and attach an error message after construction.
    failed_result = _make_result(TaskStatus.FAILED)
    failed_result.error_message = "something went wrong"

    reflection = await reflector.reflect(task, failed_result)

    assert reflection.outcome == "failure"
    assert reflection.quality_score == 0.0
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_prompt_optimizer():
    """Optimizing with enough examples renames the module and caps its demos.

    Five examples are added to an optimizer configured with ``max_demos=3``
    and ``min_examples_for_optimization=2``; the optimized module is expected
    to carry the ``_optimized`` name suffix and exactly three demos.
    """
    demo_limit = 3
    optimizer = PromptOptimizer(
        max_demos=demo_limit,
        min_examples_for_optimization=2,
    )

    # Add examples with slightly increasing quality scores (0.80 .. 0.88).
    for idx in range(5):
        optimizer.add_example(
            input_data={"query": f"query_{idx}"},
            output_data={"result": f"result_{idx}"},
            quality_score=0.8 + idx * 0.02,
        )

    signature = Signature(
        input_fields={"query": "search query"},
        output_fields={"result": "search result"},
        instruction="Find the best result.",
    )
    module = Module(name="test_module", signature=signature)

    optimized = await optimizer.optimize(module)

    assert optimized.name == "test_module_optimized"
    assert len(optimized.demos) == demo_limit
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_prompt_optimizer_not_enough_examples():
    """With no examples and a threshold of 10, the module name stays unchanged."""
    optimizer = PromptOptimizer(min_examples_for_optimization=10)

    signature = Signature(
        input_fields={"x": "input"},
        output_fields={"y": "output"},
    )
    original = Module(name="test", signature=signature)

    optimized = await optimizer.optimize(original)

    # Should return unchanged module
    assert optimized.name == "test"
|
|
|
|
|
|
def test_strategy_tuner():
    """Record three temperature configurations with their metrics on a tuner.

    NOTE(review): this test contains no assertions, so it only verifies that
    ``StrategyTuner.record`` accepts these calls without raising. Consider
    asserting on the tuner's resulting state once its query API is confirmed.
    """
    tuner = StrategyTuner()

    # (temperature, observed metric) pairs, recorded in this order.
    observations = ((0.5, 0.6), (0.7, 0.8), (0.3, 0.4))
    for temperature, metric in observations:
        tuner.record(StrategyConfig(temperature=temperature), metric=metric)
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_ab_tester():
    """Run ten assigned samples through an A/B test and evaluate it.

    Each sample is recorded under whichever group ``assign_group`` returns,
    with metric 0.7 for "experiment" and 0.5 otherwise. The evaluation must
    return a result whose control and experiment sample counts total ten.
    """
    test_id = "test-1"
    total_samples = 10

    tester = ABTester()
    tester.create_test(
        ABTestConfig(
            test_id=test_id,
            agent_name="test_agent",
            change_type="prompt",
            min_samples=5,
        )
    )

    # Record results
    for _ in range(total_samples):
        group = tester.assign_group(test_id)
        if group == "experiment":
            metric = 0.7
        else:
            metric = 0.5
        tester.record_result(test_id, group, metric)

    result = await tester.evaluate(test_id)

    assert result is not None
    assert result.control_samples + result.experiment_samples == total_samples
|