187 lines
6.6 KiB
Python
187 lines
6.6 KiB
Python
"""Tests for MultiObjectiveFitness and ExtendedStrategyTuner"""
|
|
|
|
import pytest
|
|
|
|
from agentkit.evolution.fitness import (
|
|
ExtendedStrategyConfig,
|
|
ExtendedStrategyTuner,
|
|
FitnessWeights,
|
|
MultiObjectiveFitness,
|
|
)
|
|
from agentkit.evolution.genetic import FitnessScore
|
|
|
|
|
|
class TestFitnessWeights:
    """Unit tests for ``FitnessWeights``: defaults, overrides and normalization."""

    def test_default_weights(self):
        """A default-constructed instance exposes a 0.6 / 0.2 / 0.2 split."""
        w = FitnessWeights()
        assert abs(w.accuracy - 0.6) < 0.01
        assert abs(w.latency - 0.2) < 0.01
        assert abs(w.cost - 0.2) < 0.01

    def test_custom_weights(self):
        """Explicitly supplied weights are readable back on every field."""
        w = FitnessWeights(accuracy=0.5, latency=0.3, cost=0.2)
        # The three inputs already sum to 1.0, so the normalization exercised
        # in test_auto_normalization should leave each of them as given.
        # All fields are checked so a regression in any single one is caught.
        assert abs(w.accuracy - 0.5) < 0.01
        assert abs(w.latency - 0.3) < 0.01
        assert abs(w.cost - 0.2) < 0.01

    def test_auto_normalization(self):
        """Equal raw weights of 1.0 come back as one third each."""
        w = FitnessWeights(accuracy=1.0, latency=1.0, cost=1.0)
        assert abs(w.accuracy - 1/3) < 0.01
        assert abs(w.latency - 1/3) < 0.01
        assert abs(w.cost - 1/3) < 0.01
|
|
|
|
|
|
class TestMultiObjectiveFitness:
    """Unit tests for ``MultiObjectiveFitness``.

    Covers raw evaluation, the weighted aggregate score, Pareto ranking and
    crowding distance.
    """

    # Aggregate scores are the result of floating-point arithmetic, so they
    # are compared within this absolute tolerance instead of with exact ``==``.
    FLOAT_TOLERANCE = 1e-9

    def setup_method(self):
        """Give every test its own default-constructed evaluator."""
        self.evaluator = MultiObjectiveFitness()

    def test_evaluate(self):
        """In-range inputs are passed through onto the returned score."""
        score = self.evaluator.evaluate(accuracy=0.9, latency_ms=500, cost_tokens=2000)
        assert score.accuracy == 0.9
        assert score.latency_ms == 500
        assert score.cost_tokens == 2000

    def test_evaluate_clamps_accuracy(self):
        """Accuracy outside [0, 1] is clamped to the nearest bound."""
        score = self.evaluator.evaluate(accuracy=1.5)
        assert score.accuracy == 1.0
        score = self.evaluator.evaluate(accuracy=-0.1)
        assert score.accuracy == 0.0

    def test_weighted_score(self):
        """The best value on every dimension aggregates to 1.0."""
        score = self.evaluator.evaluate(accuracy=1.0, latency_ms=0, cost_tokens=0)
        weighted = self.evaluator.weighted_score(score)
        # Perfect on all dimensions
        assert abs(weighted - 1.0) < self.FLOAT_TOLERANCE

    def test_weighted_score_zero(self):
        """The worst value on every dimension aggregates to 0.0."""
        score = self.evaluator.evaluate(accuracy=0.0, latency_ms=10000, cost_tokens=10000)
        weighted = self.evaluator.weighted_score(score)
        # Worst on all dimensions
        assert abs(weighted - 0.0) < self.FLOAT_TOLERANCE

    def test_pareto_rank_simple(self):
        """A strictly dominating score sits on the front; dominated ones rank behind."""
        scores = [
            FitnessScore(accuracy=0.9, latency_ms=100),   # Dominates all
            FitnessScore(accuracy=0.5, latency_ms=500),   # Dominated by 0
            FitnessScore(accuracy=0.3, latency_ms=1000),  # Dominated by 0, 1
        ]
        ranks = self.evaluator.pareto_rank(scores)
        assert ranks[0] == 0  # Front
        assert ranks[1] >= 1
        assert ranks[2] >= ranks[1]

    def test_pareto_rank_empty(self):
        """Ranking an empty population yields an empty list."""
        ranks = self.evaluator.pareto_rank([])
        assert ranks == []

    def test_pareto_rank_non_dominated(self):
        """Two scores that trade accuracy against latency both get rank 0."""
        scores = [
            FitnessScore(accuracy=0.9, latency_ms=500),  # High accuracy, slow
            FitnessScore(accuracy=0.5, latency_ms=100),  # Low accuracy, fast
        ]
        ranks = self.evaluator.pareto_rank(scores)
        # Neither dominates the other — both on front
        assert ranks[0] == 0
        assert ranks[1] == 0

    def test_crowding_distance(self):
        """Boundary points get infinite distance; the interior point a positive one."""
        scores = [
            FitnessScore(accuracy=0.9, latency_ms=100),
            FitnessScore(accuracy=0.7, latency_ms=300),
            FitnessScore(accuracy=0.5, latency_ms=500),
        ]
        distances = self.evaluator.crowding_distance(scores)
        assert len(distances) == 3
        assert distances[0] == float("inf")  # Boundary
        assert distances[2] == float("inf")  # Boundary
        assert distances[1] > 0  # Interior point

    def test_crowding_distance_small(self):
        """A single-element population gets infinite crowding distance."""
        scores = [FitnessScore(accuracy=0.5)]
        distances = self.evaluator.crowding_distance(scores)
        assert distances[0] == float("inf")

    def test_custom_weights_evaluator(self):
        """With all weight on accuracy, latency and cost do not affect the score."""
        evaluator = MultiObjectiveFitness(
            weights=FitnessWeights(accuracy=1.0, latency=0.0, cost=0.0)
        )
        score = evaluator.evaluate(accuracy=0.8, latency_ms=5000, cost_tokens=5000)
        weighted = evaluator.weighted_score(score)
        # Only accuracy matters
        assert abs(weighted - 0.8) < 0.01
|
|
|
|
|
|
class TestExtendedStrategyTuner:
    """Unit tests for ``ExtendedStrategyTuner`` history recording and suggestions."""

    def setup_method(self):
        """Give every test its own tuner with an empty history."""
        self.tuner = ExtendedStrategyTuner()

    def test_record_and_suggest(self):
        """Recording several results for one config grows the history."""
        config = ExtendedStrategyConfig(temperature=0.5, max_iterations=5, top_k=5)
        self.tuner.record(config, 0.7)
        self.tuner.record(config, 0.8)
        self.tuner.record(config, 0.9)
        # Without an assertion this test could only catch exceptions.
        # NOTE(review): assumes each record() call adds exactly one history
        # entry, as test_history_size shows for a single call — confirm the
        # tuner neither de-duplicates identical configs nor caps below 3.
        assert self.tuner.history_size == 3

    @pytest.mark.asyncio
    async def test_suggest_with_history(self):
        """With recorded history, the suggestion is a config within the expected bounds."""
        config = ExtendedStrategyConfig(temperature=0.7, max_iterations=5, top_k=5)
        for i in range(5):
            self.tuner.record(config, 0.5 + i * 0.1)

        suggested = await self.tuner.suggest(config)
        assert isinstance(suggested, ExtendedStrategyConfig)
        assert 0.0 <= suggested.temperature <= 2.0
        assert 1 <= suggested.max_iterations <= 10
        assert 1 <= suggested.top_k <= 20

    @pytest.mark.asyncio
    async def test_suggest_without_history(self):
        """With no history, the suggestion mirrors the config that was passed in."""
        config = ExtendedStrategyConfig()
        suggested = await self.tuner.suggest(config)
        # Should return current config unchanged
        assert suggested.temperature == config.temperature
        assert suggested.max_iterations == config.max_iterations

    @pytest.mark.asyncio
    async def test_retrieval_mode_suggestion(self):
        """A history dominated by high-scoring "enhanced" runs steers the suggestion to it."""
        config = ExtendedStrategyConfig(retrieval_mode="standard")
        enhanced_config = ExtendedStrategyConfig(retrieval_mode="enhanced")

        # Record mostly enhanced results
        for _ in range(4):
            self.tuner.record(enhanced_config, 0.9)
        self.tuner.record(config, 0.5)

        suggested = await self.tuner.suggest(config)
        assert suggested.retrieval_mode == "enhanced"

    def test_history_size(self):
        """``history_size`` starts at zero and reflects a recorded entry."""
        assert self.tuner.history_size == 0
        self.tuner.record(ExtendedStrategyConfig(), 0.5)
        assert self.tuner.history_size == 1
|
|
|
|
|
|
class TestExtendedStrategyConfig:
    """Unit tests for ``ExtendedStrategyConfig`` field defaults and overrides."""

    def test_default_values(self):
        """A default-constructed config carries the expected value on every field."""
        defaults = ExtendedStrategyConfig()
        expected = {
            "temperature": 0.5,
            "max_iterations": 5,
            "top_k": 5,
            "retrieval_mode": "enhanced",
            "tool_weights": {},
        }
        # Dict order matches the order in which the fields are checked.
        for field_name, value in expected.items():
            assert getattr(defaults, field_name) == value

    def test_custom_values(self):
        """Keyword overrides are stored and readable back from the instance."""
        scalar_overrides = {
            "temperature": 0.8,
            "max_iterations": 10,
            "top_k": 15,
            "retrieval_mode": "standard",
        }
        custom = ExtendedStrategyConfig(
            tool_weights={"search": 0.7, "analyze": 0.3},
            **scalar_overrides,
        )
        for field_name, value in scalar_overrides.items():
            assert getattr(custom, field_name) == value
        assert custom.tool_weights["search"] == 0.7
|