# fischer-agentkit/tests/unit/test_ab_tester.py

"""Tests for ABTester - A/B 测试框架"""

import pytest

from agentkit.evolution.ab_tester import ABTestConfig, ABTestResult, ABTester
from agentkit.evolution.evolution_store import InMemoryEvolutionStore


def _make_config(test_id: str = "test-001", min_samples: int = 10) -> ABTestConfig:
    """Build an ``ABTestConfig`` for the fixed ``test_agent`` / ``prompt`` scenario.

    Args:
        test_id: Identifier passed to the config as ``test_id``.
        min_samples: Value passed to the config as ``min_samples``.

    Returns:
        A newly constructed ``ABTestConfig``.
    """
    settings = {
        "test_id": test_id,
        "agent_name": "test_agent",
        "change_type": "prompt",
        "min_samples": min_samples,
    }
    return ABTestConfig(**settings)


# ── Hash-based deterministic group assignment ──────────────────


class TestHashBasedAssignment:
    """Hash-based deterministic group assignment."""

    def test_same_task_id_same_group(self):
        """Assigning the same task_id twice yields the same group."""
        ab = ABTester()
        ab.create_test(_make_config())

        assignments = [
            ab.assign_group("test-001", task_id="task-abc") for _ in range(2)
        ]
        assert assignments[0] == assignments[1]

    def test_different_task_ids_may_differ(self):
        """Twenty distinct task_ids are expected to cover both groups."""
        ab = ABTester()
        ab.create_test(_make_config())

        seen = {
            ab.assign_group("test-001", task_id=f"task-{n}") for n in range(20)
        }

        # Across 20 different task_ids, exactly two distinct groups must appear.
        assert len(seen) == 2

    def test_no_test_returns_control(self):
        """An unregistered test_id is assigned to "control"."""
        ab = ABTester()
        assert ab.assign_group("nonexistent", task_id="task-1") == "control"

    def test_deterministic_across_instances(self):
        """Two separate ABTester instances agree on the group for each task_id."""
        testers = []
        for _ in range(2):
            instance = ABTester()
            instance.create_test(_make_config())
            testers.append(instance)
        left, right = testers

        for n in range(10):
            task = f"task-{n}"
            from_left = left.assign_group("test-001", task_id=task)
            from_right = right.assign_group("test-001", task_id=task)
            assert from_left == from_right


# ── Min samples configuration ──────────────────────────────────


class TestMinSamples:
    """Minimum sample size configuration."""

    def test_default_min_samples(self):
        """A tester built without arguments stores a default min_samples of 10."""
        assert ABTester()._default_min_samples == 10

    def test_custom_min_samples(self):
        """A min_samples passed to the constructor is stored as the default."""
        assert ABTester(min_samples=5)._default_min_samples == 5

    @pytest.mark.asyncio
    async def test_insufficient_samples_not_significant(self):
        """With too few samples the result is not significant and has no winner."""
        ab = ABTester(min_samples=5)
        ab.create_test(_make_config(min_samples=5))

        # Record only 3 results per group, fewer than the configured 5.
        for _ in range(3):
            for group, score in (("control", 0.5), ("experiment", 0.8)):
                ab.record_result("test-001", group, score)

        outcome = await ab.evaluate("test-001")
        assert outcome is not None
        assert outcome.is_significant is False
        assert outcome.winner is None

    @pytest.mark.asyncio
    async def test_sufficient_samples_can_be_significant(self):
        """With enough samples and a clear gap the experiment group wins."""
        ab = ABTester(min_samples=5)
        ab.create_test(_make_config(min_samples=5))

        # Record 10 results per group with a clear difference in scores.
        for _ in range(10):
            for group, score in (("control", 0.3), ("experiment", 0.9)):
                ab.record_result("test-001", group, score)

        outcome = await ab.evaluate("test-001")
        assert outcome is not None
        assert outcome.is_significant is True
        assert outcome.winner == "experiment"


# ── Persistence ────────────────────────────────────────────────


class TestPersistence:
    """Persistence of recorded results."""

    @pytest.mark.asyncio
    async def test_persist_results_to_store(self):
        """Recorded results are written to the EvolutionStore."""
        backend = InMemoryEvolutionStore()
        ab = ABTester(evolution_store=backend, min_samples=10)
        ab.create_test(_make_config())

        # One result for each group.
        for group, score in (("control", 0.5), ("experiment", 0.8)):
            ab.record_result("test-001", group, score)

        await ab.persist_results("test-001")

        # The store should now hold one row per recorded variant.
        rows = await backend.get_ab_test_results("test-001")
        assert len(rows) == 2
        assert {row["variant"] for row in rows} == {"control", "experiment"}

    @pytest.mark.asyncio
    async def test_persist_without_store_is_noop(self):
        """Persisting without an EvolutionStore completes without raising."""
        ab = ABTester(min_samples=10)
        ab.create_test(_make_config())
        ab.record_result("test-001", "control", 0.5)

        # The call itself is the check: it must not raise.
        await ab.persist_results("test-001")

    @pytest.mark.asyncio
    async def test_persist_empty_results_is_noop(self):
        """Persisting before any result is recorded stores nothing."""
        backend = InMemoryEvolutionStore()
        ab = ABTester(evolution_store=backend, min_samples=10)
        ab.create_test(_make_config())

        # Nothing has been recorded at this point.
        await ab.persist_results("test-001")

        rows = await backend.get_ab_test_results("test-001")
        assert len(rows) == 0


# ── Evaluate ───────────────────────────────────────────────────


class TestEvaluate:
    """Evaluation logic."""

    @pytest.mark.asyncio
    async def test_evaluate_nonexistent_test(self):
        """Evaluating an unregistered test_id returns None."""
        outcome = await ABTester().evaluate("nonexistent")
        assert outcome is None

    @pytest.mark.asyncio
    async def test_evaluate_experiment_wins(self):
        """When the experiment group scores higher, the winner is "experiment"."""
        ab = ABTester(min_samples=5)
        ab.create_test(_make_config(min_samples=5))

        for _ in range(10):
            for group, score in (("control", 0.3), ("experiment", 0.9)):
                ab.record_result("test-001", group, score)

        outcome = await ab.evaluate("test-001")
        assert outcome is not None
        assert outcome.winner == "experiment"
        assert outcome.experiment_metric > outcome.control_metric

    @pytest.mark.asyncio
    async def test_evaluate_control_wins(self):
        """When the control group scores higher, the winner is "control"."""
        ab = ABTester(min_samples=5)
        ab.create_test(_make_config(min_samples=5))

        for _ in range(10):
            for group, score in (("control", 0.9), ("experiment", 0.3)):
                ab.record_result("test-001", group, score)

        outcome = await ab.evaluate("test-001")
        assert outcome is not None
        assert outcome.winner == "control"
        assert outcome.control_metric > outcome.experiment_metric