# fischer-agentkit/tests/e2e/test_capability_team.py

"""E2E Agent Capability Tests — Expert Team Collaboration with Metrics.

Tests the intelligence of expert team collaboration AND collects data for:
  - Team formation accuracy
  - Fallback effectiveness
  - Expert coordination quality
  - Overfitting detection via paraphrased inputs
"""

import pytest
import httpx

from tests.e2e.benchmark_dataset import TEAM_BENCHMARKS, BenchmarkCase
from tests.e2e.capability_metrics import MetricsCollector
from tests.e2e.conftest import register_skill_via_api


# ═══════════════════════════════════════════════════════════════════════════
# Helper: run team benchmark and record metrics
# ═══════════════════════════════════════════════════════════════════════════


def _run_team_benchmark(
    benchmark: BenchmarkCase,
    api_client: httpx.Client,
    collector: MetricsCollector,
    test_name: str,
    is_paraphrase: bool = False,
    input_override: str | None = None,
) -> dict:
    """Execute a team benchmark against the task API and record metrics.

    Posts the query to ``/api/v1/tasks`` (adding ``skill_name`` when the
    benchmark declares an expected skill) and passes the observed outcome to
    ``collector.record_benchmark_result``.

    Args:
        benchmark: Case supplying ``id``, ``input`` and ``expected_skill``.
        api_client: HTTP client used to submit the task.
        collector: Metrics sink; its timer is started before the request.
        test_name: Label stored with the recorded result.
        is_paraphrase: Whether this run uses a paraphrased input.
        input_override: Query to send instead of ``benchmark.input``. Only
            ``None`` falls back to the benchmark input; an empty string is
            sent as-is so a paraphrase run is never silently recorded
            against the original wording.

    Returns:
        A dict with ``status_code``, ``actual_skill``, ``actual_exec_mode``
        and ``task_succeeded``.
    """
    query = benchmark.input if input_override is None else input_override
    collector.start_timer(benchmark.id)

    payload: dict = {"input_data": {"query": query}}
    if benchmark.expected_skill:
        payload["skill_name"] = benchmark.expected_skill

    resp = api_client.post("/api/v1/tasks", json=payload)

    actual_skill = None
    actual_exec_mode = None
    actual_keys: list = []
    # Only an exact 200 is treated as success.
    task_succeeded = resp.status_code == 200
    error_msg = None

    if task_succeeded:
        data = resp.json()
        actual_skill = data.get("skill_name")
        actual_exec_mode = data.get("execution_mode")
        actual_keys = list(data.keys())
    else:
        # Every non-200 is recorded as a failure, so always capture a message
        # (including for 2xx/3xx responses other than 200).
        fallback = resp.text[:200]
        try:
            body = resp.json()
        except ValueError:
            # Body is not valid JSON; fall back to the truncated raw text.
            body = None
        if isinstance(body, dict):
            error_msg = body.get("detail", fallback)
        else:
            error_msg = fallback

    collector.record_benchmark_result(
        benchmark,
        test_name=test_name,
        actual_skill=actual_skill,
        actual_execution_mode=actual_exec_mode,
        actual_status_code=resp.status_code,
        actual_response_keys=actual_keys,
        task_succeeded=task_succeeded,
        is_paraphrase=is_paraphrase,
        error_message=error_msg,
    )

    return {
        "status_code": resp.status_code,
        "actual_skill": actual_skill,
        "actual_exec_mode": actual_exec_mode,
        "task_succeeded": task_succeeded,
    }


# ═══════════════════════════════════════════════════════════════════════════
# Parameterized Team Benchmark Tests
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestTeamBenchmarks:
    """Run all team benchmarks with metrics collection."""

    @staticmethod
    def _register_expected_skill(
        api_client: httpx.Client,
        benchmark: BenchmarkCase,
    ) -> None:
        """Register the benchmark's expected skill, if it declares one.

        The registration response is intentionally not inspected here; the
        skill name doubles as its only keyword.
        """
        if benchmark.expected_skill:
            register_skill_via_api(
                api_client,
                benchmark.expected_skill,
                keywords=[benchmark.expected_skill],
            )

    @pytest.mark.parametrize(
        "benchmark",
        TEAM_BENCHMARKS,
        ids=[b.id for b in TEAM_BENCHMARKS],
    )
    def test_team_benchmark(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run original team benchmark and record metrics."""
        self._register_expected_skill(api_client, benchmark)

        result = _run_team_benchmark(
            benchmark,
            api_client,
            metrics_collector,
            test_name=f"team_benchmark_{benchmark.id}",
        )
        assert result["status_code"] == 200, f"Team benchmark {benchmark.id} failed"

    @pytest.mark.parametrize(
        "benchmark",
        [b for b in TEAM_BENCHMARKS if b.paraphrases],
        ids=[b.id for b in TEAM_BENCHMARKS if b.paraphrases],
    )
    def test_team_paraphrase(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run paraphrases for overfitting detection.

        Each paraphrase is submitted and recorded with ``is_paraphrase=True``.
        No assertion is made on the individual outcomes; they are only fed to
        the metrics collector.
        """
        # Register here as well so this test does not depend on
        # test_team_benchmark having run first for the same case.
        self._register_expected_skill(api_client, benchmark)

        for i, paraphrase in enumerate(benchmark.paraphrases):
            _run_team_benchmark(
                benchmark,
                api_client,
                metrics_collector,
                test_name=f"team_paraphrase_{benchmark.id}_{i}",
                is_paraphrase=True,
                input_override=paraphrase,
            )


# ═══════════════════════════════════════════════════════════════════════════
# Team Formation Intelligence
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestTeamFormation:
    """Checks how a team is assembled for a request spanning several skills."""

    def test_explicit_team_prefix(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Submit an analyse-and-write request and record its paraphrases.

        Two skills are registered, the request is pinned to ``team_analyst``
        and must return HTTP 200. The paraphrased variants are then submitted
        for metrics only; their outcomes are not asserted.

        NOTE(review): the name of this test refers to an "@team" prefix, but
        the input sent here does not contain one — confirm the intended query.
        """
        skills_to_register = (
            ("team_analyst", ["team_analyst", "analyze"]),
            ("team_writer", ["team_writer", "write"]),
        )
        for skill_name, skill_keywords in skills_to_register:
            register_skill_via_api(api_client, skill_name, keywords=skill_keywords)

        case = BenchmarkCase(
            id="team-explicit-001",
            input="Analyze the data and write a report",
            expected_skill="team_analyst",
            expected_execution_mode="react",
            expected_complexity="high",
            category="team",
            subcategory="explicit_team",
            paraphrases=["I need analysis and a written report", "分析数据并写报告"],
        )

        # Baseline run with the original wording must succeed.
        outcome = _run_team_benchmark(
            case,
            api_client,
            metrics_collector,
            test_name="team_explicit",
        )
        assert outcome["status_code"] == 200

        # Paraphrased runs are recorded against the same benchmark case.
        for index, reworded in enumerate(case.paraphrases):
            _run_team_benchmark(
                case,
                api_client,
                metrics_collector,
                test_name=f"team_explicit_para_{index}",
                is_paraphrase=True,
                input_override=reworded,
            )


# ═══════════════════════════════════════════════════════════════════════════
# Fallback Intelligence
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestTeamFallback:
    """Checks that a fallback-oriented request is still served successfully."""

    def test_fallback_to_single_agent_on_team_failure(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Submit a request pinned to ``fallback_skill`` and expect HTTP 200.

        The paraphrased variants are submitted afterwards for metrics only;
        their outcomes are not asserted.

        NOTE(review): nothing in this test induces a team failure itself, so
        whether a fallback actually happened is not verified here — confirm
        against the server-side behaviour.
        """
        register_skill_via_api(api_client, "fallback_skill", keywords=["fallback_test"])

        case = BenchmarkCase(
            id="team-fallback-001",
            input="Complex task that might need fallback",
            expected_skill="fallback_skill",
            expected_complexity="high",
            category="team",
            subcategory="fallback",
            paraphrases=["Difficult task requiring fallback mechanism", "需要回退机制的复杂任务"],
        )

        # Baseline run with the original wording must succeed.
        outcome = _run_team_benchmark(
            case,
            api_client,
            metrics_collector,
            test_name="team_fallback",
        )
        assert outcome["status_code"] == 200

        # Paraphrased runs are recorded against the same benchmark case.
        for index, reworded in enumerate(case.paraphrases):
            _run_team_benchmark(
                case,
                api_client,
                metrics_collector,
                test_name=f"team_fallback_para_{index}",
                is_paraphrase=True,
                input_override=reworded,
            )


# ═══════════════════════════════════════════════════════════════════════════
# Expert Name Validation
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestExpertNameValidation:
    """Test that expert names are validated according to project rules."""

    def test_valid_expert_names(self, api_client: httpx.Client):
        """Valid expert names (alphanumeric, dash, underscore) should work."""
        for name in ["analyst", "data-scientist", "code_reviewer", "expert-123"]:
            resp = register_skill_via_api(api_client, name, keywords=[name])
            # 409 is tolerated alongside success; presumably the name may
            # already exist from an earlier registration — TODO confirm.
            assert resp.status_code in (200, 201, 409), f"Failed for name: {name}"

    def test_invalid_expert_name_rejected(self, api_client: httpx.Client):
        """Invalid expert names should be rejected.

        Covers a name with spaces, a name with a special character, the empty
        string and a 65-character name (presumably one past the length limit —
        TODO confirm). Only client-error rejections (400 or 422) pass: a
        success status or 409 would mean the invalid name was accepted at some
        point, which is exactly what this test must catch.
        """
        for name in ["expert with spaces", "expert@special", "", "a" * 65]:
            resp = register_skill_via_api(api_client, name, keywords=[name])
            assert resp.status_code in (400, 422), (
                f"Unexpected status {resp.status_code} for name: '{name}'"
            )