# fischer-agentkit/tests/unit/test_ab_tester.py
#
# 206 lines
# 7.0 KiB
# Python

"""Tests for ABTester, the A/B testing framework."""
import pytest
from agentkit.evolution.ab_tester import ABTestConfig, ABTestResult, ABTester
from agentkit.evolution.evolution_store import InMemoryEvolutionStore
def _make_config(test_id: str = "test-001", min_samples: int = 10) -> ABTestConfig:
    """Build the ``ABTestConfig`` shared by the tests in this module.

    Args:
        test_id: Identifier given to the A/B test.
        min_samples: Value forwarded as the config's ``min_samples``.

    Returns:
        A config with ``agent_name="test_agent"`` and
        ``change_type="prompt"``.
    """
    settings = {
        "test_id": test_id,
        "agent_name": "test_agent",
        "change_type": "prompt",
        "min_samples": min_samples,
    }
    return ABTestConfig(**settings)
# ── Hash-based deterministic group assignment ──────────────────
class TestHashBasedAssignment:
    """Deterministic, hash-based group assignment."""

    def test_same_task_id_same_group(self):
        """The same task_id is always assigned to the same group."""
        ab = ABTester()
        ab.create_test(_make_config())

        first = ab.assign_group("test-001", task_id="task-abc")
        second = ab.assign_group("test-001", task_id="task-abc")

        assert first == second

    def test_different_task_ids_may_differ(self):
        """Different task_ids can be assigned to different groups."""
        ab = ABTester()
        ab.create_test(_make_config())

        seen = {
            ab.assign_group("test-001", task_id=f"task-{n}")
            for n in range(20)
        }

        # Twenty distinct task_ids are expected to cover both groups.
        assert len(seen) == 2

    def test_no_test_returns_control(self):
        """An unregistered test_id yields the control group."""
        ab = ABTester()

        assigned = ab.assign_group("nonexistent", task_id="task-1")

        assert assigned == "control"

    def test_deterministic_across_instances(self):
        """Separate ABTester instances agree on the group for a task_id."""
        left = ABTester()
        left.create_test(_make_config())
        right = ABTester()
        right.create_test(_make_config())

        for n in range(10):
            task_id = f"task-{n}"
            from_left = left.assign_group("test-001", task_id=task_id)
            from_right = right.assign_group("test-001", task_id=task_id)
            assert from_left == from_right
# ── Min samples configuration ──────────────────────────────────
class TestMinSamples:
    """Minimum sample size configuration."""

    def test_default_min_samples(self):
        """The default min_samples is 10."""
        ab = ABTester()

        assert ab._default_min_samples == 10

    def test_custom_min_samples(self):
        """A min_samples passed to the constructor is stored."""
        ab = ABTester(min_samples=5)

        assert ab._default_min_samples == 5

    @pytest.mark.asyncio
    async def test_insufficient_samples_not_significant(self):
        """With too few samples the result is not significant."""
        ab = ABTester(min_samples=5)
        ab.create_test(_make_config(min_samples=5))

        # Only 3 results per group, below the configured minimum of 5.
        for _ in range(3):
            for variant, score in (("control", 0.5), ("experiment", 0.8)):
                ab.record_result("test-001", variant, score)

        outcome = await ab.evaluate("test-001")

        assert outcome is not None
        assert outcome.is_significant is False
        assert outcome.winner is None

    @pytest.mark.asyncio
    async def test_sufficient_samples_can_be_significant(self):
        """With enough samples the result can be significant."""
        ab = ABTester(min_samples=5)
        ab.create_test(_make_config(min_samples=5))

        # 10 results per group with a clear gap between the two scores.
        for _ in range(10):
            for variant, score in (("control", 0.3), ("experiment", 0.9)):
                ab.record_result("test-001", variant, score)

        outcome = await ab.evaluate("test-001")

        assert outcome is not None
        assert outcome.is_significant is True
        assert outcome.winner == "experiment"
# ── Persistence ────────────────────────────────────────────────
class TestPersistence:
    """Persistence of recorded results."""

    @pytest.mark.asyncio
    async def test_persist_results_to_store(self):
        """Recorded results are persisted to the EvolutionStore."""
        backing_store = InMemoryEvolutionStore()
        ab = ABTester(evolution_store=backing_store, min_samples=10)
        ab.create_test(_make_config())

        # One result for each variant.
        ab.record_result("test-001", "control", 0.5)
        ab.record_result("test-001", "experiment", 0.8)
        await ab.persist_results("test-001")

        # Both results should now be readable from the store.
        rows = await backing_store.get_ab_test_results("test-001")
        assert len(rows) == 2
        seen_variants = {row["variant"] for row in rows}
        assert seen_variants == {"control", "experiment"}

    @pytest.mark.asyncio
    async def test_persist_without_store_is_noop(self):
        """Persisting without an EvolutionStore does nothing."""
        ab = ABTester(min_samples=10)
        ab.create_test(_make_config())
        ab.record_result("test-001", "control", 0.5)

        # The call completing without an exception is the whole check.
        await ab.persist_results("test-001")

    @pytest.mark.asyncio
    async def test_persist_empty_results_is_noop(self):
        """Persisting when nothing was recorded stores nothing."""
        backing_store = InMemoryEvolutionStore()
        ab = ABTester(evolution_store=backing_store, min_samples=10)
        ab.create_test(_make_config())

        # No results have been recorded at this point.
        await ab.persist_results("test-001")

        rows = await backing_store.get_ab_test_results("test-001")
        assert len(rows) == 0
# ── Evaluate ───────────────────────────────────────────────────
class TestEvaluate:
    """Evaluation logic."""

    @pytest.mark.asyncio
    async def test_evaluate_nonexistent_test(self):
        """Evaluating an unregistered test returns None."""
        ab = ABTester()

        outcome = await ab.evaluate("nonexistent")

        assert outcome is None

    @pytest.mark.asyncio
    async def test_evaluate_experiment_wins(self):
        """When the experiment group wins, winner is "experiment"."""
        ab = ABTester(min_samples=5)
        ab.create_test(_make_config(min_samples=5))

        for _ in range(10):
            for variant, score in (("control", 0.3), ("experiment", 0.9)):
                ab.record_result("test-001", variant, score)

        outcome = await ab.evaluate("test-001")

        assert outcome is not None
        assert outcome.winner == "experiment"
        assert outcome.experiment_metric > outcome.control_metric

    @pytest.mark.asyncio
    async def test_evaluate_control_wins(self):
        """When the control group wins, winner is "control"."""
        ab = ABTester(min_samples=5)
        ab.create_test(_make_config(min_samples=5))

        for _ in range(10):
            for variant, score in (("control", 0.9), ("experiment", 0.3)):
                ab.record_result("test-001", variant, score)

        outcome = await ab.evaluate("test-001")

        assert outcome is not None
        assert outcome.winner == "control"
        assert outcome.control_metric > outcome.experiment_metric