fischer-agentkit/tests/unit/test_fitness.py

187 lines
6.6 KiB
Python

"""Tests for MultiObjectiveFitness and ExtendedStrategyTuner"""
import pytest
from agentkit.evolution.fitness import (
ExtendedStrategyConfig,
ExtendedStrategyTuner,
FitnessWeights,
MultiObjectiveFitness,
)
from agentkit.evolution.genetic import FitnessScore
class TestFitnessWeights:
    """FitnessWeights unit tests."""

    # Absolute tolerance used for every weight comparison in this class.
    _TOL = 0.01

    def test_default_weights(self):
        """A default-constructed instance weighs accuracy 0.6, latency 0.2, cost 0.2."""
        w = FitnessWeights()
        assert w.accuracy == pytest.approx(0.6, abs=self._TOL)
        assert w.latency == pytest.approx(0.2, abs=self._TOL)
        assert w.cost == pytest.approx(0.2, abs=self._TOL)

    def test_custom_weights(self):
        """Explicit weights are kept for every component, not just accuracy."""
        w = FitnessWeights(accuracy=0.5, latency=0.3, cost=0.2)
        assert w.accuracy == pytest.approx(0.5, abs=self._TOL)
        # The inputs already sum to 1, so normalization (see
        # test_auto_normalization) is expected to leave them unchanged.
        # NOTE(review): confirm against FitnessWeights if this ever fails.
        assert w.latency == pytest.approx(0.3, abs=self._TOL)
        assert w.cost == pytest.approx(0.2, abs=self._TOL)

    def test_auto_normalization(self):
        """Equal weights that do not sum to 1 come back as one third each."""
        w = FitnessWeights(accuracy=1.0, latency=1.0, cost=1.0)
        assert w.accuracy == pytest.approx(1 / 3, abs=self._TOL)
        assert w.latency == pytest.approx(1 / 3, abs=self._TOL)
        assert w.cost == pytest.approx(1 / 3, abs=self._TOL)
class TestMultiObjectiveFitness:
    """MultiObjectiveFitness unit tests."""

    def setup_method(self):
        """Create a default-weighted evaluator for each test."""
        self.evaluator = MultiObjectiveFitness()

    def test_evaluate(self):
        """In-range inputs are carried onto the returned score unchanged."""
        score = self.evaluator.evaluate(accuracy=0.9, latency_ms=500, cost_tokens=2000)
        assert score.accuracy == 0.9
        assert score.latency_ms == 500
        assert score.cost_tokens == 2000

    def test_evaluate_clamps_accuracy(self):
        """Accuracy outside [0, 1] is clamped to the nearest bound."""
        score = self.evaluator.evaluate(accuracy=1.5)
        assert score.accuracy == 1.0
        score = self.evaluator.evaluate(accuracy=-0.1)
        assert score.accuracy == 0.0

    def test_weighted_score(self):
        """Perfect accuracy with zero latency and zero cost scores 1."""
        score = self.evaluator.evaluate(accuracy=1.0, latency_ms=0, cost_tokens=0)
        weighted = self.evaluator.weighted_score(score)
        # The weighted sum is computed from normalized float weights, so
        # compare with a tolerance instead of exact equality.
        assert weighted == pytest.approx(1.0)  # Perfect on all dimensions

    def test_weighted_score_zero(self):
        """Zero accuracy with latency and cost of 10000 each scores 0."""
        score = self.evaluator.evaluate(accuracy=0.0, latency_ms=10000, cost_tokens=10000)
        weighted = self.evaluator.weighted_score(score)
        # Tolerance-based for the same reason as test_weighted_score.
        assert weighted == pytest.approx(0.0)  # Worst on all dimensions

    def test_pareto_rank_simple(self):
        """A strictly ordered chain of scores gets non-decreasing ranks."""
        scores = [
            FitnessScore(accuracy=0.9, latency_ms=100),  # Dominates all
            FitnessScore(accuracy=0.5, latency_ms=500),  # Dominated by 0
            FitnessScore(accuracy=0.3, latency_ms=1000),  # Dominated by 0, 1
        ]
        ranks = self.evaluator.pareto_rank(scores)
        assert ranks[0] == 0  # Front
        assert ranks[1] >= 1
        assert ranks[2] >= ranks[1]

    def test_pareto_rank_empty(self):
        """Ranking an empty population returns an empty list."""
        ranks = self.evaluator.pareto_rank([])
        assert ranks == []

    def test_pareto_rank_non_dominated(self):
        """Scores that trade accuracy against latency share rank 0."""
        scores = [
            FitnessScore(accuracy=0.9, latency_ms=500),  # High accuracy, slow
            FitnessScore(accuracy=0.5, latency_ms=100),  # Low accuracy, fast
        ]
        ranks = self.evaluator.pareto_rank(scores)
        # Neither dominates the other — both on front
        assert ranks[0] == 0
        assert ranks[1] == 0

    def test_crowding_distance(self):
        """Boundary points get infinite distance; the interior one is positive."""
        scores = [
            FitnessScore(accuracy=0.9, latency_ms=100),
            FitnessScore(accuracy=0.7, latency_ms=300),
            FitnessScore(accuracy=0.5, latency_ms=500),
        ]
        distances = self.evaluator.crowding_distance(scores)
        assert len(distances) == 3
        assert distances[0] == float("inf")  # Boundary
        assert distances[2] == float("inf")  # Boundary
        assert distances[1] > 0  # Interior point

    def test_crowding_distance_small(self):
        """A single score is assigned infinite crowding distance."""
        scores = [FitnessScore(accuracy=0.5)]
        distances = self.evaluator.crowding_distance(scores)
        assert distances[0] == float("inf")

    def test_custom_weights_evaluator(self):
        """With all weight on accuracy, latency and cost do not affect the score."""
        evaluator = MultiObjectiveFitness(
            weights=FitnessWeights(accuracy=1.0, latency=0.0, cost=0.0)
        )
        score = evaluator.evaluate(accuracy=0.8, latency_ms=5000, cost_tokens=5000)
        weighted = evaluator.weighted_score(score)
        # Only accuracy matters
        assert weighted == pytest.approx(0.8, abs=0.01)
class TestExtendedStrategyTuner:
    """ExtendedStrategyTuner unit tests."""

    def setup_method(self):
        """Create a new tuner for each test."""
        self.tuner = ExtendedStrategyTuner()

    def test_record_and_suggest(self):
        """Recording three values for one config stores all three."""
        config = ExtendedStrategyConfig(temperature=0.5, max_iterations=5, top_k=5)
        self.tuner.record(config, 0.7)
        self.tuner.record(config, 0.8)
        self.tuner.record(config, 0.9)
        # Without an assertion this test could only fail if record() raised;
        # check that each call was actually stored.
        assert self.tuner.history_size == 3

    @pytest.mark.asyncio
    async def test_suggest_with_history(self):
        """With recorded history, suggest() returns a config within bounds."""
        config = ExtendedStrategyConfig(temperature=0.7, max_iterations=5, top_k=5)
        for i in range(5):
            self.tuner.record(config, 0.5 + i * 0.1)
        suggested = await self.tuner.suggest(config)
        assert isinstance(suggested, ExtendedStrategyConfig)
        assert 0.0 <= suggested.temperature <= 2.0
        assert 1 <= suggested.max_iterations <= 10
        assert 1 <= suggested.top_k <= 20

    @pytest.mark.asyncio
    async def test_suggest_without_history(self):
        """With no recorded history, suggest() leaves the config as it was."""
        config = ExtendedStrategyConfig()
        suggested = await self.tuner.suggest(config)
        # Should return current config unchanged
        assert suggested.temperature == config.temperature
        assert suggested.max_iterations == config.max_iterations

    @pytest.mark.asyncio
    async def test_retrieval_mode_suggestion(self):
        """suggest() picks the retrieval mode that dominates the history."""
        config = ExtendedStrategyConfig(retrieval_mode="standard")
        enhanced_config = ExtendedStrategyConfig(retrieval_mode="enhanced")
        # Record mostly enhanced results
        for _ in range(4):
            self.tuner.record(enhanced_config, 0.9)
        self.tuner.record(config, 0.5)
        suggested = await self.tuner.suggest(config)
        assert suggested.retrieval_mode == "enhanced"

    def test_history_size(self):
        """history_size starts at 0 and grows by one per record() call."""
        assert self.tuner.history_size == 0
        self.tuner.record(ExtendedStrategyConfig(), 0.5)
        assert self.tuner.history_size == 1
class TestExtendedStrategyConfig:
    """ExtendedStrategyConfig unit tests."""

    def test_default_values(self):
        """A config built with no arguments exposes the expected defaults."""
        defaults = ExtendedStrategyConfig()
        expected = {
            "temperature": 0.5,
            "max_iterations": 5,
            "top_k": 5,
            "retrieval_mode": "enhanced",
            "tool_weights": {},
        }
        for attr, value in expected.items():
            assert getattr(defaults, attr) == value

    def test_custom_values(self):
        """Keyword arguments passed to the constructor are readable afterwards."""
        overrides = {
            "temperature": 0.8,
            "max_iterations": 10,
            "top_k": 15,
            "retrieval_mode": "standard",
            "tool_weights": {"search": 0.7, "analyze": 0.3},
        }
        custom = ExtendedStrategyConfig(**overrides)
        for attr in ("temperature", "max_iterations", "top_k", "retrieval_mode"):
            assert getattr(custom, attr) == overrides[attr]
        # Only the "search" entry of the tool weights is verified.
        assert custom.tool_weights["search"] == 0.7