"""E2E Agent Capability Tests — Intent Routing Intelligence with Metrics Collection.

Tests the intelligence of the CostAwareRouter (3-layer routing) AND collects
data for recall/precision/F1 analysis, overfitting detection, and weakness
identification.

Each test:
  1. Runs the benchmark case (original input)
  2. Runs all paraphrases of the same input (overfitting detection)
  3. Records observations to MetricsCollector
  4. Asserts minimum quality thresholds
"""

import pytest
import httpx

from tests.e2e.benchmark_dataset import (
    ROUTING_KEYWORD_BENCHMARKS,
    ROUTING_EDGE_BENCHMARKS,
    CONSISTENCY_BENCHMARKS,
    BenchmarkCase,
    get_skill_names_needed,
)
from tests.e2e.capability_metrics import MetricsCollector
from tests.e2e.conftest import register_skill_via_api

# Maximum number of characters of a raw response body kept as an error message.
_ERROR_MESSAGE_MAX_CHARS = 200

# How many times each consistency benchmark is submitted.
_CONSISTENCY_RUNS = 3

# Combined benchmark lists, computed once so the parametrize decorators below
# cannot drift apart from their matching ``ids`` lists.
_ROUTING_BENCHMARKS = ROUTING_KEYWORD_BENCHMARKS + ROUTING_EDGE_BENCHMARKS
_PARAPHRASED_ROUTING_BENCHMARKS = [b for b in _ROUTING_BENCHMARKS if b.paraphrases]


# ═══════════════════════════════════════════════════════════════════════════
# Pre-registration of all skills needed by benchmarks
# ═══════════════════════════════════════════════════════════════════════════


@pytest.fixture(autouse=True, scope="module")
def register_benchmark_skills(api_client: httpx.Client):
    """Register every skill named by ``get_skill_names_needed()`` before the module runs.

    Each skill is registered with its own name as its only keyword.
    """
    for skill_name in get_skill_names_needed():
        register_skill_via_api(api_client, skill_name, keywords=[skill_name])


# ═══════════════════════════════════════════════════════════════════════════
# Helper: run a single benchmark case and record metrics
# ═══════════════════════════════════════════════════════════════════════════


def _extract_error_message(resp: httpx.Response):
    """Return the ``detail`` field of a JSON error body, or a truncated raw body.

    Falls back to the first ``_ERROR_MESSAGE_MAX_CHARS`` characters of
    ``resp.text`` when the body is not valid JSON, is not a JSON object, or
    has no ``detail`` key. The ``detail`` value is returned as-is, whatever
    its type.
    """
    fallback = resp.text[:_ERROR_MESSAGE_MAX_CHARS]
    try:
        body = resp.json()
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        return fallback
    if isinstance(body, dict):
        return body.get("detail", fallback)
    return fallback


def _run_benchmark_and_record(
    benchmark: BenchmarkCase,
    api_client: httpx.Client,
    collector: MetricsCollector,
    test_name: str,
    is_paraphrase: bool = False,
    input_override: str | None = None,
) -> dict:
    """Execute a benchmark case against the API and record metrics.

    Args:
        benchmark: The case to run; its ``input``, ``id`` and ``expected_skill``
            are read here.
        api_client: HTTP client used to POST to ``/api/v1/tasks``.
        collector: Receives ``start_timer`` and ``record_benchmark_result`` calls.
        test_name: Label forwarded to the collector for this observation.
        is_paraphrase: Forwarded to the collector to tag the observation.
        input_override: Query text to send instead of ``benchmark.input``.
            Only ``None`` means "no override"; an empty string is sent as-is.

    Returns:
        A dict with ``status_code``, ``actual_skill``, ``actual_exec_mode`` and
        ``task_succeeded``.
    """
    # Compare against None rather than relying on truthiness: an empty-string
    # override must be sent as given, not replaced by the original input.
    query = benchmark.input if input_override is None else input_override

    collector.start_timer(benchmark.id)

    payload: dict = {"input_data": {"query": query}}
    if benchmark.expected_skill is not None:
        payload["skill_name"] = benchmark.expected_skill

    resp = api_client.post("/api/v1/tasks", json=payload)

    actual_skill = None
    actual_exec_mode = None
    actual_keys: list = []
    error_msg = None

    # Only an exact 200 counts as success; other non-error statuses are
    # recorded as failures without an error message.
    task_succeeded = resp.status_code == 200
    if task_succeeded:
        data = resp.json()
        actual_skill = data.get("skill_name")
        actual_exec_mode = data.get("execution_mode")
        actual_keys = list(data.keys())
    elif resp.status_code >= 400:
        error_msg = _extract_error_message(resp)

    collector.record_benchmark_result(
        benchmark,
        test_name=test_name,
        actual_skill=actual_skill,
        actual_execution_mode=actual_exec_mode,
        actual_status_code=resp.status_code,
        actual_response_keys=actual_keys,
        task_succeeded=task_succeeded,
        is_paraphrase=is_paraphrase,
        error_message=error_msg,
    )

    return {
        "status_code": resp.status_code,
        "actual_skill": actual_skill,
        "actual_exec_mode": actual_exec_mode,
        "task_succeeded": task_succeeded,
    }


def _run_paraphrases(
    benchmark: BenchmarkCase,
    api_client: httpx.Client,
    collector: MetricsCollector,
    test_name_prefix: str,
) -> None:
    """Run every paraphrase of ``benchmark`` and record each as a paraphrase.

    Observations are named ``f"{test_name_prefix}_{i}"`` where ``i`` is the
    paraphrase's index. Results are recorded only; nothing is asserted.
    """
    for i, paraphrase in enumerate(benchmark.paraphrases):
        _run_benchmark_and_record(
            benchmark,
            api_client,
            collector,
            test_name=f"{test_name_prefix}_{i}",
            is_paraphrase=True,
            input_override=paraphrase,
        )


# ═══════════════════════════════════════════════════════════════════════════
# Parameterized Routing Benchmark Tests
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestRoutingBenchmarks:
    """Run all routing benchmarks with metrics collection."""

    @pytest.mark.parametrize(
        "benchmark",
        _ROUTING_BENCHMARKS,
        ids=[b.id for b in _ROUTING_BENCHMARKS],
    )
    def test_routing_benchmark(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run original benchmark input and record metrics."""
        result = _run_benchmark_and_record(
            benchmark,
            api_client,
            metrics_collector,
            test_name=f"routing_benchmark_{benchmark.id}",
        )
        assert result["status_code"] == 200, f"Benchmark {benchmark.id} failed: {result}"

    @pytest.mark.parametrize(
        "benchmark",
        _PARAPHRASED_ROUTING_BENCHMARKS,
        ids=[b.id for b in _PARAPHRASED_ROUTING_BENCHMARKS],
    )
    def test_routing_paraphrase(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run all paraphrases for overfitting detection.

        This test only records observations; it makes no assertions itself.
        """
        _run_paraphrases(
            benchmark,
            api_client,
            metrics_collector,
            test_name_prefix=f"routing_paraphrase_{benchmark.id}",
        )


# ═══════════════════════════════════════════════════════════════════════════
# Routing Consistency (same input, multiple runs)
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestRoutingConsistency:
    """Same input should produce same routing decision (deterministic backtest)."""

    def test_same_query_same_skill(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Submitting the same query multiple times should route to the same skill."""
        for benchmark in CONSISTENCY_BENCHMARKS:
            skills_seen: list[str | None] = []
            for run_idx in range(_CONSISTENCY_RUNS):
                result = _run_benchmark_and_record(
                    benchmark,
                    api_client,
                    metrics_collector,
                    test_name=f"consistency_{benchmark.id}_run{run_idx}",
                )
                skills_seen.append(result["actual_skill"])

            # Runs that reported no skill (None) are ignored; the remaining
            # runs must all agree, and fewer than two leaves nothing to compare.
            non_none_skills = [s for s in skills_seen if s is not None]
            if len(non_none_skills) >= 2:
                assert len(set(non_none_skills)) == 1, (
                    f"Inconsistent routing for {benchmark.id}: {skills_seen}"
                )


# ═══════════════════════════════════════════════════════════════════════════
# Routing Disambiguation (specific edge cases)
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestRoutingDisambiguation:
    """When multiple skills could match, the router should pick the best one."""

    def test_overlapping_keywords_routes_to_best_match(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """With overlapping keywords, router should pick the most relevant skill."""
        # Two skills sharing the "code" and "programming" keywords.
        register_skill_via_api(
            api_client,
            "python_coder",
            keywords=["python", "code", "programming"],
        )
        register_skill_via_api(
            api_client,
            "javascript_coder",
            keywords=["javascript", "code", "programming"],
        )

        benchmark = BenchmarkCase(
            id="disambig-python-001",
            input="Write a Python function to sort a list",
            expected_skill="python_coder",
            expected_complexity="medium",
            category="routing",
            subcategory="disambiguation",
            paraphrases=["I need a Python sorting algorithm", "用Python写个排序函数"],
        )

        result = _run_benchmark_and_record(
            benchmark,
            api_client,
            metrics_collector,
            test_name="disambig_python",
        )
        assert result["status_code"] == 200

        # Also test paraphrases for overfitting detection
        _run_paraphrases(
            benchmark,
            api_client,
            metrics_collector,
            test_name_prefix="disambig_python_para",
        )

    def test_no_matching_skill_falls_back_gracefully(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """When no skill matches, should fall back to direct chat."""
        benchmark = BenchmarkCase(
            id="fallback-nomatch-001",
            input="Tell me about quantum physics",
            expected_skill=None,
            expected_complexity="low",
            category="routing",
            subcategory="fallback",
            paraphrases=["Explain quantum mechanics", "量子物理是什么"],
        )

        result = _run_benchmark_and_record(
            benchmark,
            api_client,
            metrics_collector,
            test_name="fallback_nomatch",
        )
        assert result["status_code"] == 200

        _run_paraphrases(
            benchmark,
            api_client,
            metrics_collector,
            test_name_prefix="fallback_nomatch_para",
        )