"""E2E Agent Capability Tests — Router Direct Backtest Layer (Real LLM).

Directly tests CostAwareRouter.route() using real LLM configuration loaded from
agentkit.yaml. Records fields of the SkillRoutingResult for precise root cause
analysis:
- match_method (layer0/layer1/layer1.5/layer2)
- match_confidence
- complexity score
- execution_trace (NOTE(review): not read by the helper in this module — confirm
  whether it is captured elsewhere)
"""

import asyncio
import os
from pathlib import Path

import pytest

from agentkit.chat.skill_routing import CostAwareRouter
from agentkit.router.intent import IntentRouter
from agentkit.server.app import _build_llm_gateway, _build_skill_registry
from agentkit.server.config import ServerConfig
from agentkit.skills.registry import SkillRegistry
from tests.e2e.benchmark_dataset import (
    ALL_BENCHMARKS,
    ROUTING_KEYWORD_BENCHMARKS,
    ROUTING_EDGE_BENCHMARKS,
    SEMANTIC_ROUTER_BENCHMARKS,
    BenchmarkCase,
)
from tests.e2e.capability_metrics import MetricsCollector


# ═══════════════════════════════════════════════════════════════════════════
# Real component initialization from agentkit.yaml
# ═══════════════════════════════════════════════════════════════════════════

# Fallback endpoint written into a provider that receives the DashScope key
# but has no base_url of its own.
_DASHSCOPE_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"


def _find_config_path() -> str | None:
    """Find agentkit.yaml in standard search paths.

    Search order: the ``AGENTKIT_CONFIG`` environment variable, the current
    working directory, then ``~/.agentkit``.

    Returns:
        The first candidate that is an existing file, or ``None``.
    """
    candidates = [
        os.environ.get("AGENTKIT_CONFIG", ""),
        str(Path.cwd() / "agentkit.yaml"),
        str(Path.home() / ".agentkit" / "agentkit.yaml"),
    ]
    for path in candidates:
        # An unset AGENTKIT_CONFIG yields "", which must not be probed as a path.
        if path and Path(path).is_file():
            return path
    return None


def _load_env_file(env_path: Path) -> None:
    """Load ``KEY=VALUE`` pairs from *env_path* into ``os.environ``.

    Uses python-dotenv when it is importable; otherwise falls back to a
    minimal parser that skips blank lines and ``#`` comments, strips
    surrounding quotes from values, and never overrides variables that are
    already set.
    """
    try:
        from dotenv import load_dotenv

        load_dotenv(env_path)
    except ImportError:
        # python-dotenv not installed, manually parse .env.
        # Explicit encoding: the default is locale-dependent and can fail on
        # non-ASCII values.
        with open(env_path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line and not line.startswith("#") and "=" in line:
                    key, _, value = line.partition("=")
                    os.environ.setdefault(key.strip(), value.strip().strip("'\""))


def _inject_dashscope_key(server_config: ServerConfig) -> None:
    """Give the first key-less provider the ``DASHSCOPE_API_KEY`` from the env.

    Does nothing when the variable is unset or empty, or when every provider
    already has an API key. The provider's ``base_url`` is filled with the
    DashScope endpoint only when it is missing.
    """
    dashscope_key = os.environ.get("DASHSCOPE_API_KEY", "")
    if not dashscope_key:
        return
    for pconf in server_config.llm_config.providers.values():
        if not pconf.api_key:
            pconf.api_key = dashscope_key
            if not pconf.base_url:
                pconf.base_url = _DASHSCOPE_BASE_URL
            # Only one provider is patched.
            break


def _build_real_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
    """Build real components from agentkit.yaml configuration.

    Returns:
        ``(router, skill_registry, intent_router)``.

    Skips the calling test (via ``pytest.skip``) when no agentkit.yaml is
    found or no LLM provider with a valid API key is configured.
    """
    config_path = _find_config_path()
    if not config_path:
        pytest.skip("No agentkit.yaml found — cannot build real components")

    # Load .env if present (looked up next to the config file).
    env_path = Path(config_path).parent / ".env"
    if env_path.exists():
        _load_env_file(env_path)

    server_config = ServerConfig.from_yaml(config_path)

    # Check if any LLM provider has a valid API key; if not, try the
    # DASHSCOPE_API_KEY fallback before giving up.
    if not server_config.has_llm_provider():
        _inject_dashscope_key(server_config)
        if not server_config.has_llm_provider():
            pytest.skip("No LLM provider with valid API key — skipping real LLM tests")

    # Build real LLM gateway
    llm_gateway = _build_llm_gateway(server_config)

    # Build real skill registry from configs/skills
    skill_registry = _build_skill_registry(server_config)

    # Build real intent router
    intent_router = IntentRouter(llm_gateway=llm_gateway)

    # Build real CostAwareRouter; missing router settings fall back to the
    # defaults given here.
    router_conf = server_config.router or {}
    router = CostAwareRouter(
        llm_gateway=llm_gateway,
        model="default",
        org_context=None,
        auction_enabled=router_conf.get("auction_enabled", False),
        classifier=router_conf.get("classifier", "heuristic"),
        merged_llm_classify=router_conf.get("merged_llm_classify", True),
    )

    return router, skill_registry, intent_router


# Cache components at module level to avoid rebuilding for every test
_cached_components: tuple[CostAwareRouter, SkillRegistry, IntentRouter] | None = None


def _get_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
    """Get or build real components (cached at module level).

    The cache is only populated on a successful build; when the build skips,
    the next caller attempts the build again.

    NOTE(review): the tests in this module each call ``asyncio.run()``, which
    creates a fresh event loop, while these components are shared. If the
    gateway holds loop-bound async resources, reuse across loops may fail —
    confirm against the gateway implementation.
    """
    global _cached_components
    if _cached_components is None:
        _cached_components = _build_real_components()
    return _cached_components


# ═══════════════════════════════════════════════════════════════════════════
# Helper: Run a single benchmark through the real router
# ═══════════════════════════════════════════════════════════════════════════

# Benchmark execution-mode labels mapped to the upper-cased router mode they
# are compared against. Labels not listed here are compared by upper-casing.
_EXECUTION_MODE_ALIASES: dict[str, str] = {
    "direct": "DIRECT_CHAT",
    "react": "SKILL_REACT",
    "rewoo": "REWOO",
    "reflexion": "REFLEXION",
    "plan_exec": "PLAN_EXEC",
    "team_collab": "TEAM_COLLAB",
    "llm_generate": "SKILL_REACT",
    "tool_call": "SKILL_REACT",
    "custom": "SKILL_REACT",
}


def _complexity_level(score: float) -> str:
    """Bucket a complexity score: < 0.3 is low, < 0.7 is medium, else high."""
    if score < 0.3:
        return "low"
    if score < 0.7:
        return "medium"
    return "high"


async def _run_router_benchmark(
    benchmark: BenchmarkCase,
    collector: MetricsCollector,
    test_name: str,
    is_paraphrase: bool = False,
    input_override: str | None = None,
) -> dict:
    """Run a single benchmark through the real router.

    Args:
        benchmark: The case to route; supplies the query and the expectations.
        collector: Receives the timer start and the recorded observation.
        test_name: Label stored with the observation.
        is_paraphrase: Forwarded to the collector to mark paraphrase runs.
        input_override: Query used instead of ``benchmark.input`` when truthy
            (an empty string falls back to ``benchmark.input``).

    Returns:
        A dict with the correctness verdicts (``skill_correct``,
        ``execution_mode_correct``, ``complexity_correct``), the raw routing
        outputs (``actual_*``) and ``task_succeeded``.

    Any ``Exception`` raised while routing is caught and recorded as a failed
    observation (status 500, message truncated to 200 characters) rather than
    propagated.
    """
    router, skill_registry, intent_router = _get_components()

    query = input_override or benchmark.input
    collector.start_timer(benchmark.id)

    try:
        result = await router.route(
            content=query,
            skill_registry=skill_registry,
            intent_router=intent_router,
            default_tools=[],
            default_system_prompt=None,
        )

        actual_skill = result.skill_name
        actual_exec_mode = result.execution_mode.value if result.execution_mode else None
        actual_complexity = result.complexity
        actual_match_method = result.match_method
        actual_match_confidence = result.match_confidence
        task_succeeded = True
        error_msg = None
    except Exception as e:
        # Deliberately broad: a routing failure is an observation to record,
        # not a reason to abort the benchmark run.
        actual_skill = None
        actual_exec_mode = None
        actual_complexity = 0.0
        actual_match_method = None
        actual_match_confidence = 0.0
        task_succeeded = False
        error_msg = str(e)[:200]

    # Map complexity score to level.
    # NOTE(review): a failed route is scored 0.0 and therefore lands in "low",
    # which counts as complexity-correct for benchmarks expecting "low".
    actual_complexity_level = _complexity_level(actual_complexity)

    # Judge correctness. None means "not judged".
    # NOTE(review): when a skill is expected but none was selected the verdict
    # stays None rather than False; when no skill is expected the verdict is
    # True on every path (a failed route always has actual_skill None).
    skill_correct = None
    if benchmark.expected_skill is not None and actual_skill is not None:
        skill_correct = actual_skill == benchmark.expected_skill
    elif benchmark.expected_skill is None:
        skill_correct = actual_skill is None or task_succeeded

    execution_mode_correct = None
    if actual_exec_mode is not None and benchmark.expected_execution_mode:
        expected_normalized = _EXECUTION_MODE_ALIASES.get(
            benchmark.expected_execution_mode, benchmark.expected_execution_mode.upper()
        )
        execution_mode_correct = actual_exec_mode.upper() == expected_normalized

    complexity_correct = actual_complexity_level == benchmark.expected_complexity

    obs = collector.record_benchmark_result(
        benchmark,
        test_name=test_name,
        actual_skill=actual_skill,
        actual_execution_mode=actual_exec_mode,
        actual_status_code=200 if task_succeeded else 500,
        task_succeeded=task_succeeded,
        is_paraphrase=is_paraphrase,
        error_message=error_msg,
    )
    # Set on the returned observation after the fact; it is not passed as an
    # argument to record_benchmark_result.
    obs.complexity_correct = complexity_correct

    return {
        "skill_correct": skill_correct,
        "execution_mode_correct": execution_mode_correct,
        "complexity_correct": complexity_correct,
        "actual_skill": actual_skill,
        "actual_exec_mode": actual_exec_mode,
        "actual_complexity": actual_complexity,
        "actual_complexity_level": actual_complexity_level,
        "actual_match_method": actual_match_method,
        "actual_match_confidence": actual_match_confidence,
        "task_succeeded": task_succeeded,
    }


# ═══════════════════════════════════════════════════════════════════════════
# Layer 0: Rule Matching Tests
# ═══════════════════════════════════════════════════════════════════════════

# Edge-case benchmarks exercised by the Layer 0 test; computed once so the
# parametrize values and their ids cannot drift apart.
_LAYER0_BENCHMARKS = [
    b
    for b in ROUTING_EDGE_BENCHMARKS
    if b.subcategory in ("greeting", "identity", "explicit_prefix")
]


@pytest.mark.e2e_capability
class TestRouterLayer0:
    """Test Layer 0 rule matching with real router."""

    @pytest.mark.parametrize(
        "benchmark",
        _LAYER0_BENCHMARKS,
        ids=[b.id for b in _LAYER0_BENCHMARKS],
    )
    def test_layer0_rules(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
        """Layer 0 should correctly match greetings, identity, and @skill: prefix.

        Benchmarks in the "identity" subcategory are only recorded; no
        assertion is made for them.
        """
        result = asyncio.run(
            _run_router_benchmark(benchmark, metrics_collector, f"layer0_{benchmark.id}")
        )

        if benchmark.subcategory == "greeting":
            # NOTE(review): this cannot fail — a successful route satisfies the
            # right-hand side and a failed route reports match_method None.
            assert result["actual_match_method"] in ("layer0", None) or result["task_succeeded"]

        if benchmark.subcategory == "explicit_prefix":
            # NOTE(review): any successful route passes; this only fails when
            # routing raised and a skill was expected.
            assert result["actual_skill"] == benchmark.expected_skill or result["task_succeeded"]


# ═══════════════════════════════════════════════════════════════════════════
# Layer 1: Complexity Classification Tests
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestRouterLayer1:
    """Test Layer 1 complexity classification with real router."""

    @pytest.mark.parametrize(
        "benchmark",
        ROUTING_KEYWORD_BENCHMARKS,
        ids=[b.id for b in ROUTING_KEYWORD_BENCHMARKS],
    )
    def test_complexity_classification(
        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
    ):
        """Record the router's complexity estimate for each keyword benchmark.

        The outcome is only recorded in the metrics collector; nothing is
        asserted here.
        """
        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, f"layer1_{benchmark.id}"))


# ═══════════════════════════════════════════════════════════════════════════
# Semantic Router Tests
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestSemanticRouter:
    """Test semantic router matching with real router."""

    @pytest.mark.parametrize(
        "benchmark",
        SEMANTIC_ROUTER_BENCHMARKS,
        ids=[b.id for b in SEMANTIC_ROUTER_BENCHMARKS],
    )
    def test_semantic_match(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
        """Record the routing outcome for each semantic-router benchmark.

        The outcome is only recorded in the metrics collector; nothing is
        asserted here.
        """
        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, f"semantic_{benchmark.id}"))


# ═══════════════════════════════════════════════════════════════════════════
# Paraphrase Consistency Tests (Overfitting Detection)
# ═══════════════════════════════════════════════════════════════════════════
# Cap on how many paraphrase-bearing benchmarks are parametrized.
_MAX_PARAPHRASE_BENCHMARKS = 10

# Benchmarks that have paraphrases and a concrete expected skill; computed once
# so the parametrize values and their ids cannot drift apart.
_PARAPHRASE_BENCHMARKS = [
    b for b in ALL_BENCHMARKS if b.paraphrases and b.expected_skill is not None
][:_MAX_PARAPHRASE_BENCHMARKS]


@pytest.mark.e2e_capability
class TestRouterParaphraseConsistency:
    """Route original and paraphrased inputs and record both outcomes."""

    @pytest.mark.parametrize(
        "benchmark",
        _PARAPHRASE_BENCHMARKS,
        ids=[b.id for b in _PARAPHRASE_BENCHMARKS],
    )
    def test_paraphrase_routes_same_skill(
        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
    ):
        """Run the original input and every paraphrase through the router.

        Each run is recorded in the metrics collector; paraphrase runs are
        flagged with ``is_paraphrase=True``.

        NOTE(review): despite the test name, nothing here asserts that the
        paraphrases route to the same skill as the original — presumably
        consistency is evaluated from the collected metrics; confirm.
        """
        # Run original
        asyncio.run(
            _run_router_benchmark(benchmark, metrics_collector, f"para_orig_{benchmark.id}")
        )

        # Run paraphrases; each gets its own asyncio.run() call.
        for i, para in enumerate(benchmark.paraphrases):
            asyncio.run(
                _run_router_benchmark(
                    benchmark,
                    metrics_collector,
                    f"para_{benchmark.id}_{i}",
                    is_paraphrase=True,
                    input_override=para,
                )
            )