# Source: fischer-agentkit/tests/e2e/test_capability_router_dire... (path truncated in the original listing)
#
# 343 lines
# 14 KiB
# Python

"""E2E Agent Capability Tests — Router Direct Backtest Layer (Real LLM).
Directly tests CostAwareRouter.route() using real LLM configuration
loaded from agentkit.yaml. Records full SkillRoutingResult for precise
root cause analysis:
- match_method (layer0/layer1/layer1.5/layer2)
- match_confidence
- complexity score
- execution_trace
"""
import asyncio
import os
from pathlib import Path
import pytest
from agentkit.chat.skill_routing import CostAwareRouter
from agentkit.router.intent import IntentRouter
from agentkit.server.app import _build_llm_gateway, _build_skill_registry
from agentkit.server.config import ServerConfig
from agentkit.skills.registry import SkillRegistry
from tests.e2e.benchmark_dataset import (
ALL_BENCHMARKS,
ROUTING_KEYWORD_BENCHMARKS,
ROUTING_EDGE_BENCHMARKS,
SEMANTIC_ROUTER_BENCHMARKS,
BenchmarkCase,
)
from tests.e2e.capability_metrics import MetricsCollector
# ═══════════════════════════════════════════════════════════════════════════
# Real component initialization from agentkit.yaml
# ═══════════════════════════════════════════════════════════════════════════
def _find_config_path() -> str | None:
"""Find agentkit.yaml in standard search paths."""
candidates = [
os.environ.get("AGENTKIT_CONFIG", ""),
str(Path.cwd() / "agentkit.yaml"),
str(Path.home() / ".agentkit" / "agentkit.yaml"),
]
for path in candidates:
if path and Path(path).is_file():
return path
return None
def _load_env_file(env_path: Path) -> None:
    """Export the variables of a ``.env`` file without overriding existing ones."""
    try:
        from dotenv import load_dotenv
    except ImportError:
        # python-dotenv is not installed: fall back to a minimal KEY=VALUE
        # parser. The file is read as UTF-8 so the result does not depend on
        # the platform's locale encoding.
        with open(env_path, encoding="utf-8") as env_file:
            for raw_line in env_file:
                entry = raw_line.strip()
                if not entry or entry.startswith("#") or "=" not in entry:
                    continue
                key, _, value = entry.partition("=")
                # setdefault: variables already present in the environment win.
                os.environ.setdefault(key.strip(), value.strip().strip("'\""))
    else:
        load_dotenv(env_path)


def _inject_dashscope_key(server_config: ServerConfig) -> None:
    """Give the first key-less provider the ``DASHSCOPE_API_KEY`` from the environment."""
    dashscope_key = os.environ.get("DASHSCOPE_API_KEY", "")
    if not dashscope_key:
        return
    for pconf in server_config.llm_config.providers.values():
        if pconf.api_key:
            continue
        pconf.api_key = dashscope_key
        # Only fill in the DashScope endpoint when the provider has no base_url.
        if not pconf.base_url:
            pconf.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
        # Only the first provider without a key is patched.
        break


def _build_real_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
    """Build real components from the ``agentkit.yaml`` configuration.

    Steps:

    1. Locate the config file via ``_find_config_path()``.
    2. Load a ``.env`` file sitting next to it, if one exists as a regular file.
    3. Parse the config with ``ServerConfig.from_yaml``.
    4. If ``has_llm_provider()`` is false, try to inject ``DASHSCOPE_API_KEY``
       into the first provider that has no API key, then check again.
    5. Build the LLM gateway, skill registry, intent router and
       ``CostAwareRouter``.

    Returns:
        ``(router, skill_registry, intent_router)``.

    Raises:
        The pytest skip outcome (via ``pytest.skip``) when no config file is
        found, or when ``has_llm_provider()`` is still false after the
        injection attempt.
    """
    config_path = _find_config_path()
    if not config_path:
        pytest.skip("No agentkit.yaml found — cannot build real components")

    # Load .env if present. is_file() rather than exists(): a directory named
    # ".env" (a common virtualenv name) must not be opened as a file.
    env_path = Path(config_path).parent / ".env"
    if env_path.is_file():
        _load_env_file(env_path)

    server_config = ServerConfig.from_yaml(config_path)

    # Check if any LLM provider has a valid API key; if not, try the
    # DASHSCOPE_API_KEY fallback before giving up.
    if not server_config.has_llm_provider():
        _inject_dashscope_key(server_config)
        if not server_config.has_llm_provider():
            pytest.skip("No LLM provider with valid API key — skipping real LLM tests")

    # Build real LLM gateway
    llm_gateway = _build_llm_gateway(server_config)
    # Build real skill registry (from the server configuration)
    skill_registry = _build_skill_registry(server_config)
    # Build real intent router
    intent_router = IntentRouter(llm_gateway=llm_gateway)

    # Build real CostAwareRouter; missing router settings fall back to the
    # defaults given here.
    router_conf = server_config.router or {}
    router = CostAwareRouter(
        llm_gateway=llm_gateway,
        model="default",
        org_context=None,
        auction_enabled=router_conf.get("auction_enabled", False),
        classifier=router_conf.get("classifier", "heuristic"),
        merged_llm_classify=router_conf.get("merged_llm_classify", True),
    )
    return router, skill_registry, intent_router
# Module-level cache so the components are built once instead of once per test.
_cached_components: tuple[CostAwareRouter, SkillRegistry, IntentRouter] | None = None


def _get_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
    """Return the shared ``(router, skill_registry, intent_router)`` triple.

    The triple is built by ``_build_real_components()`` on first use and then
    served from the module-level cache. If the build raises, nothing is
    cached and the next call attempts the build again.
    """
    global _cached_components
    if _cached_components is not None:
        return _cached_components
    _cached_components = _build_real_components()
    return _cached_components
# ═══════════════════════════════════════════════════════════════════════════
# Helper: Run a single benchmark through the real router
# ═══════════════════════════════════════════════════════════════════════════
# Benchmark execution-mode labels mapped to the upper-cased mode name they are
# compared against. Labels not listed here are simply upper-cased.
_EXECUTION_MODE_ALIASES = {
    "direct": "DIRECT_CHAT",
    "react": "SKILL_REACT",
    "rewoo": "REWOO",
    "reflexion": "REFLEXION",
    "plan_exec": "PLAN_EXEC",
    "team_collab": "TEAM_COLLAB",
    "llm_generate": "SKILL_REACT",
    "tool_call": "SKILL_REACT",
    "custom": "SKILL_REACT",
}


def _complexity_level(score: float) -> str:
    """Bucket a complexity score: below 0.3 is "low", below 0.7 "medium", else "high"."""
    if score < 0.3:
        return "low"
    if score < 0.7:
        return "medium"
    return "high"


def _normalize_expected_mode(expected_mode: str) -> str:
    """Translate a benchmark's expected mode label into its upper-case mode name."""
    return _EXECUTION_MODE_ALIASES.get(expected_mode, expected_mode.upper())


async def _run_router_benchmark(
    benchmark: BenchmarkCase,
    collector: MetricsCollector,
    test_name: str,
    is_paraphrase: bool = False,
    input_override: str | None = None,
) -> dict:
    """Run a single benchmark through the real router and record the outcome.

    Args:
        benchmark: The case to route; supplies the input and expectations.
        collector: Receives the timer start and the recorded result.
        test_name: Name under which the result is recorded.
        is_paraphrase: Forwarded to the collector to tag paraphrase runs.
        input_override: Query to route instead of ``benchmark.input``. A
            falsy value (``None`` or ``""``) falls back to ``benchmark.input``.

    Returns:
        A dict with the judgements (``skill_correct``,
        ``execution_mode_correct``, ``complexity_correct``) and the observed
        values (``actual_skill``, ``actual_exec_mode``, ``actual_complexity``,
        ``actual_complexity_level``, ``actual_match_method``,
        ``actual_match_confidence``, ``task_succeeded``).

        ``skill_correct`` and ``execution_mode_correct`` are ``None`` when
        they could not be judged. ``complexity_correct`` is always a bool and
        is ``False`` for a failed route.

    Any exception raised while routing or while reading the result is caught
    and recorded as a failed task (status code 500), not propagated.
    """
    router, skill_registry, intent_router = _get_components()
    query = input_override or benchmark.input
    collector.start_timer(benchmark.id)
    try:
        result = await router.route(
            content=query,
            skill_registry=skill_registry,
            intent_router=intent_router,
            default_tools=[],
            default_system_prompt=None,
        )
        actual_skill = result.skill_name
        actual_exec_mode = result.execution_mode.value if result.execution_mode else None
        actual_complexity = result.complexity
        actual_match_method = result.match_method
        actual_match_confidence = result.match_confidence
        task_succeeded = True
        error_msg = None
    except Exception as exc:
        # Deliberately broad: a benchmark run must be recorded as a failure,
        # not abort the test. All observations are reset so that a partially
        # read result cannot leak through.
        actual_skill = None
        actual_exec_mode = None
        actual_complexity = 0.0
        actual_match_method = None
        actual_match_confidence = 0.0
        task_succeeded = False
        error_msg = str(exc)[:200]

    # Map complexity score to level
    actual_complexity_level = _complexity_level(actual_complexity)

    # Judge skill correctness.
    if benchmark.expected_skill is None:
        # NOTE(review): this is True for every outcome — a failed route leaves
        # actual_skill as None. Kept as in the original; confirm the intent.
        skill_correct = actual_skill is None or task_succeeded
    elif task_succeeded:
        # A successful route that picked no skill (actual_skill is None) while
        # one was expected is a miss, not an unknown.
        skill_correct = actual_skill == benchmark.expected_skill
    else:
        # The route failed, so there is nothing to judge.
        skill_correct = None

    # Judge execution mode only when both sides are available.
    execution_mode_correct = None
    if actual_exec_mode is not None and benchmark.expected_execution_mode:
        expected_normalized = _normalize_expected_mode(benchmark.expected_execution_mode)
        execution_mode_correct = actual_exec_mode.upper() == expected_normalized

    # A failed route defaults to score 0.0 ("low"); it must not be credited as
    # a correct classification for benchmarks that expect "low".
    complexity_correct = (
        task_succeeded and actual_complexity_level == benchmark.expected_complexity
    )

    obs = collector.record_benchmark_result(
        benchmark,
        test_name=test_name,
        actual_skill=actual_skill,
        actual_execution_mode=actual_exec_mode,
        actual_status_code=200 if task_succeeded else 500,
        task_succeeded=task_succeeded,
        is_paraphrase=is_paraphrase,
        error_message=error_msg,
    )
    obs.complexity_correct = complexity_correct

    return {
        "skill_correct": skill_correct,
        "execution_mode_correct": execution_mode_correct,
        "complexity_correct": complexity_correct,
        "actual_skill": actual_skill,
        "actual_exec_mode": actual_exec_mode,
        "actual_complexity": actual_complexity,
        "actual_complexity_level": actual_complexity_level,
        "actual_match_method": actual_match_method,
        "actual_match_confidence": actual_match_confidence,
        "task_succeeded": task_succeeded,
    }
# ═══════════════════════════════════════════════════════════════════════════
# Layer 0: Rule Matching Tests
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestRouterLayer0:
    """Layer 0 rule-matching checks against the real router."""

    # Edge benchmarks in the subcategories this class exercises; computed once
    # and shared by the parameter list and the test ids.
    _CASES = [
        case
        for case in ROUTING_EDGE_BENCHMARKS
        if case.subcategory in ("greeting", "identity", "explicit_prefix")
    ]

    @pytest.mark.parametrize("benchmark", _CASES, ids=[case.id for case in _CASES])
    def test_layer0_rules(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
        """Route greeting, identity and @skill: prefix cases and check the outcome.

        Identity cases are only routed and recorded; no assertion applies to them.
        """
        outcome = asyncio.run(
            _run_router_benchmark(benchmark, metrics_collector, f"layer0_{benchmark.id}")
        )
        succeeded = outcome["task_succeeded"]
        subcategory = benchmark.subcategory

        if subcategory == "greeting":
            # NOTE(review): a failed route reports match_method None, so this
            # assertion holds for every outcome — confirm that is intended.
            assert outcome["actual_match_method"] in ("layer0", None) or succeeded
        if subcategory == "explicit_prefix":
            # Passes when the expected skill was chosen or the route succeeded at all.
            assert outcome["actual_skill"] == benchmark.expected_skill or succeeded
# ═══════════════════════════════════════════════════════════════════════════
# Layer 1: Complexity Classification Tests
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestRouterLayer1:
    """Layer 1 complexity-classification runs against the real router."""

    @pytest.mark.parametrize(
        "benchmark",
        ROUTING_KEYWORD_BENCHMARKS,
        ids=[case.id for case in ROUTING_KEYWORD_BENCHMARKS],
    )
    def test_complexity_classification(
        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
    ):
        """Route each keyword benchmark and record its outcome.

        No assertion is made here; the run only feeds ``metrics_collector``.
        """
        test_name = f"layer1_{benchmark.id}"
        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, test_name))
# ═══════════════════════════════════════════════════════════════════════════
# Semantic Router Tests
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestSemanticRouter:
    """Semantic-router matching runs against the real router."""

    @pytest.mark.parametrize(
        "benchmark",
        SEMANTIC_ROUTER_BENCHMARKS,
        ids=[case.id for case in SEMANTIC_ROUTER_BENCHMARKS],
    )
    def test_semantic_match(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
        """Route each semantic benchmark and record its outcome.

        No assertion is made here; the run only feeds ``metrics_collector``.
        """
        test_name = f"semantic_{benchmark.id}"
        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, test_name))
# ═══════════════════════════════════════════════════════════════════════════
# Paraphrase Consistency Tests (Overfitting Detection)
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestRouterParaphraseConsistency:
    """Route originals and their paraphrases so both are recorded side by side."""

    # First ten benchmarks that have paraphrases and expect a concrete skill.
    _CASES = [
        case
        for case in ALL_BENCHMARKS
        if case.paraphrases and case.expected_skill is not None
    ][:10]

    @pytest.mark.parametrize("benchmark", _CASES, ids=[case.id for case in _CASES])
    def test_paraphrase_routes_same_skill(
        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
    ):
        """Run the original input, then every paraphrase, through the router.

        NOTE(review): the visible code makes no assertion; consistency is
        presumably judged from the collected metrics — confirm.
        """
        # Original wording first.
        asyncio.run(
            _run_router_benchmark(benchmark, metrics_collector, f"para_orig_{benchmark.id}")
        )
        # Then each paraphrase, tagged as such for the collector.
        for index, paraphrase in enumerate(benchmark.paraphrases):
            run_name = f"para_{benchmark.id}_{index}"
            asyncio.run(
                _run_router_benchmark(
                    benchmark,
                    metrics_collector,
                    run_name,
                    is_paraphrase=True,
                    input_override=paraphrase,
                )
            )