"""Tests for ABTester - A/B 测试框架""" import pytest from agentkit.evolution.ab_tester import ABTestConfig, ABTestResult, ABTester from agentkit.evolution.evolution_store import InMemoryEvolutionStore def _make_config(test_id: str = "test-001", min_samples: int = 10) -> ABTestConfig: return ABTestConfig( test_id=test_id, agent_name="test_agent", change_type="prompt", min_samples=min_samples, ) # ── Hash-based deterministic group assignment ────────────────── class TestHashBasedAssignment: """测试 hash-based 确定性分组""" def test_same_task_id_same_group(self): """同一 task_id 总是分配到同一组""" tester = ABTester() tester.create_test(_make_config()) group1 = tester.assign_group("test-001", task_id="task-abc") group2 = tester.assign_group("test-001", task_id="task-abc") assert group1 == group2 def test_different_task_ids_may_differ(self): """不同 task_id 可能分配到不同组""" tester = ABTester() tester.create_test(_make_config()) groups = set() for i in range(20): group = tester.assign_group("test-001", task_id=f"task-{i}") groups.add(group) # With 20 different task_ids, we should see both groups assert len(groups) == 2 def test_no_test_returns_control(self): """不存在的 test_id 返回 control""" tester = ABTester() group = tester.assign_group("nonexistent", task_id="task-1") assert group == "control" def test_deterministic_across_instances(self): """不同 ABTester 实例对同一 task_id 分配结果一致""" tester1 = ABTester() tester1.create_test(_make_config()) tester2 = ABTester() tester2.create_test(_make_config()) for i in range(10): g1 = tester1.assign_group("test-001", task_id=f"task-{i}") g2 = tester2.assign_group("test-001", task_id=f"task-{i}") assert g1 == g2 # ── Min samples configuration ────────────────────────────────── class TestMinSamples: """测试最小样本量配置""" def test_default_min_samples(self): """默认 min_samples 为 10""" tester = ABTester() assert tester._default_min_samples == 10 def test_custom_min_samples(self): """自定义 min_samples""" tester = ABTester(min_samples=5) assert tester._default_min_samples == 5 @pytest.mark.asyncio async def test_insufficient_samples_not_significant(self): """样本不足时结果不显著""" tester = ABTester(min_samples=5) tester.create_test(_make_config(min_samples=5)) # Add only 3 results per group for i in range(3): tester.record_result("test-001", "control", 0.5) tester.record_result("test-001", "experiment", 0.8) result = await tester.evaluate("test-001") assert result is not None assert result.is_significant is False assert result.winner is None @pytest.mark.asyncio async def test_sufficient_samples_can_be_significant(self): """样本充足时结果可以显著""" tester = ABTester(min_samples=5) tester.create_test(_make_config(min_samples=5)) # Add 10 results per group with clear difference for i in range(10): tester.record_result("test-001", "control", 0.3) tester.record_result("test-001", "experiment", 0.9) result = await tester.evaluate("test-001") assert result is not None assert result.is_significant is True assert result.winner == "experiment" # ── Persistence ──────────────────────────────────────────────── class TestPersistence: """测试结果持久化""" @pytest.mark.asyncio async def test_persist_results_to_store(self): """结果持久化到 EvolutionStore""" store = InMemoryEvolutionStore() tester = ABTester(evolution_store=store, min_samples=10) tester.create_test(_make_config()) # Add some results tester.record_result("test-001", "control", 0.5) tester.record_result("test-001", "experiment", 0.8) await tester.persist_results("test-001") # Check store has the results stored = await store.get_ab_test_results("test-001") assert len(stored) == 2 variants = {r["variant"] for r in stored} assert variants == {"control", "experiment"} @pytest.mark.asyncio async def test_persist_without_store_is_noop(self): """没有 EvolutionStore 时持久化是无操作""" tester = ABTester(min_samples=10) tester.create_test(_make_config()) tester.record_result("test-001", "control", 0.5) # Should not raise await tester.persist_results("test-001") @pytest.mark.asyncio async def test_persist_empty_results_is_noop(self): """没有结果时持久化是无操作""" store = InMemoryEvolutionStore() tester = ABTester(evolution_store=store, min_samples=10) tester.create_test(_make_config()) # No results recorded yet await tester.persist_results("test-001") stored = await store.get_ab_test_results("test-001") assert len(stored) == 0 # ── Evaluate ─────────────────────────────────────────────────── class TestEvaluate: """测试评估逻辑""" @pytest.mark.asyncio async def test_evaluate_nonexistent_test(self): """评估不存在的测试返回 None""" tester = ABTester() result = await tester.evaluate("nonexistent") assert result is None @pytest.mark.asyncio async def test_evaluate_experiment_wins(self): """实验组获胜时 winner 为 experiment""" tester = ABTester(min_samples=5) tester.create_test(_make_config(min_samples=5)) for i in range(10): tester.record_result("test-001", "control", 0.3) tester.record_result("test-001", "experiment", 0.9) result = await tester.evaluate("test-001") assert result is not None assert result.winner == "experiment" assert result.experiment_metric > result.control_metric @pytest.mark.asyncio async def test_evaluate_control_wins(self): """对照组获胜时 winner 为 control""" tester = ABTester(min_samples=5) tester.create_test(_make_config(min_samples=5)) for i in range(10): tester.record_result("test-001", "control", 0.9) tester.record_result("test-001", "experiment", 0.3) result = await tester.evaluate("test-001") assert result is not None assert result.winner == "control" assert result.control_metric > result.experiment_metric