"""E2E Agent Capability Tests — Expert Team Collaboration with Metrics. Tests the intelligence of expert team collaboration AND collects data for: - Team formation accuracy - Fallback effectiveness - Expert coordination quality - Overfitting detection via paraphrased inputs """ import pytest import httpx from tests.e2e.benchmark_dataset import TEAM_BENCHMARKS, BenchmarkCase from tests.e2e.capability_metrics import MetricsCollector from tests.e2e.conftest import register_skill_via_api # ═══════════════════════════════════════════════════════════════════════════ # Helper: run team benchmark and record metrics # ═══════════════════════════════════════════════════════════════════════════ def _run_team_benchmark( benchmark: BenchmarkCase, api_client: httpx.Client, collector: MetricsCollector, test_name: str, is_paraphrase: bool = False, input_override: str | None = None, ) -> dict: """Execute a team benchmark and record metrics.""" query = input_override or benchmark.input collector.start_timer(benchmark.id) payload: dict = {"input_data": {"query": query}} if benchmark.expected_skill: payload["skill_name"] = benchmark.expected_skill resp = api_client.post("/api/v1/tasks", json=payload) actual_skill = None actual_exec_mode = None actual_keys = [] task_succeeded = resp.status_code == 200 error_msg = None if task_succeeded: data = resp.json() actual_skill = data.get("skill_name") actual_exec_mode = data.get("execution_mode") actual_keys = list(data.keys()) elif resp.status_code >= 400: try: error_msg = resp.json().get("detail", resp.text[:200]) except Exception: error_msg = resp.text[:200] collector.record_benchmark_result( benchmark, test_name=test_name, actual_skill=actual_skill, actual_execution_mode=actual_exec_mode, actual_status_code=resp.status_code, actual_response_keys=actual_keys, task_succeeded=task_succeeded, is_paraphrase=is_paraphrase, error_message=error_msg, ) return { "status_code": resp.status_code, "actual_skill": actual_skill, "actual_exec_mode": actual_exec_mode, "task_succeeded": task_succeeded, } # ═══════════════════════════════════════════════════════════════════════════ # Parameterized Team Benchmark Tests # ═══════════════════════════════════════════════════════════════════════════ @pytest.mark.e2e_capability class TestTeamBenchmarks: """Run all team benchmarks with metrics collection.""" @pytest.mark.parametrize( "benchmark", TEAM_BENCHMARKS, ids=[b.id for b in TEAM_BENCHMARKS], ) def test_team_benchmark( self, benchmark: BenchmarkCase, api_client: httpx.Client, metrics_collector: MetricsCollector, ): """Run original team benchmark and record metrics.""" if benchmark.expected_skill: register_skill_via_api( api_client, benchmark.expected_skill, keywords=[benchmark.expected_skill], ) result = _run_team_benchmark( benchmark, api_client, metrics_collector, test_name=f"team_benchmark_{benchmark.id}", ) assert result["status_code"] == 200, f"Team benchmark {benchmark.id} failed" @pytest.mark.parametrize( "benchmark", [b for b in TEAM_BENCHMARKS if b.paraphrases], ids=[b.id for b in TEAM_BENCHMARKS if b.paraphrases], ) def test_team_paraphrase( self, benchmark: BenchmarkCase, api_client: httpx.Client, metrics_collector: MetricsCollector, ): """Run paraphrases for overfitting detection.""" for i, paraphrase in enumerate(benchmark.paraphrases): _run_team_benchmark( benchmark, api_client, metrics_collector, test_name=f"team_paraphrase_{benchmark.id}_{i}", is_paraphrase=True, input_override=paraphrase, ) # ═══════════════════════════════════════════════════════════════════════════ # Team Formation Intelligence # ═══════════════════════════════════════════════════════════════════════════ @pytest.mark.e2e_capability class TestTeamFormation: """Test that teams are formed intelligently based on task requirements.""" def test_explicit_team_prefix( self, api_client: httpx.Client, metrics_collector: MetricsCollector, ): """@team prefix should trigger team collaboration mode.""" register_skill_via_api(api_client, "team_analyst", keywords=["team_analyst", "analyze"]) register_skill_via_api(api_client, "team_writer", keywords=["team_writer", "write"]) benchmark = BenchmarkCase( id="team-explicit-001", input="Analyze the data and write a report", expected_skill="team_analyst", expected_execution_mode="react", expected_complexity="high", category="team", subcategory="explicit_team", paraphrases=["I need analysis and a written report", "分析数据并写报告"], ) result = _run_team_benchmark( benchmark, api_client, metrics_collector, test_name="team_explicit", ) assert result["status_code"] == 200 for i, para in enumerate(benchmark.paraphrases): _run_team_benchmark( benchmark, api_client, metrics_collector, test_name=f"team_explicit_para_{i}", is_paraphrase=True, input_override=para, ) # ═══════════════════════════════════════════════════════════════════════════ # Fallback Intelligence # ═══════════════════════════════════════════════════════════════════════════ @pytest.mark.e2e_capability class TestTeamFallback: """Test that team collaboration falls back gracefully on failure.""" def test_fallback_to_single_agent_on_team_failure( self, api_client: httpx.Client, metrics_collector: MetricsCollector, ): """If team collaboration fails, should fall back to single agent.""" register_skill_via_api(api_client, "fallback_skill", keywords=["fallback_test"]) benchmark = BenchmarkCase( id="team-fallback-001", input="Complex task that might need fallback", expected_skill="fallback_skill", expected_complexity="high", category="team", subcategory="fallback", paraphrases=["Difficult task requiring fallback mechanism", "需要回退机制的复杂任务"], ) result = _run_team_benchmark( benchmark, api_client, metrics_collector, test_name="team_fallback", ) assert result["status_code"] == 200 for i, para in enumerate(benchmark.paraphrases): _run_team_benchmark( benchmark, api_client, metrics_collector, test_name=f"team_fallback_para_{i}", is_paraphrase=True, input_override=para, ) # ═══════════════════════════════════════════════════════════════════════════ # Expert Name Validation # ═══════════════════════════════════════════════════════════════════════════ @pytest.mark.e2e_capability class TestExpertNameValidation: """Test that expert names are validated according to project rules.""" def test_valid_expert_names(self, api_client: httpx.Client): """Valid expert names (alphanumeric, dash, underscore) should work.""" for name in ["analyst", "data-scientist", "code_reviewer", "expert-123"]: resp = register_skill_via_api(api_client, name, keywords=[name]) assert resp.status_code in (200, 201, 409), f"Failed for name: {name}" def test_invalid_expert_name_rejected(self, api_client: httpx.Client): """Invalid expert names should be rejected.""" for name in ["expert with spaces", "expert@special", "", "a" * 65]: resp = register_skill_via_api(api_client, name, keywords=[name]) assert resp.status_code in (200, 201, 400, 409, 422), ( f"Unexpected status for name: '{name}'" )