"""E2E Agent Capability Tests — Router Direct Backtest Layer (Real LLM).

Directly tests CostAwareRouter.route() using real LLM configuration
loaded from agentkit.yaml. Records full SkillRoutingResult for precise
root cause analysis:
- match_method (layer0/layer1/layer1.5/layer2)
- match_confidence
- complexity score
- execution_trace
"""
|
|
|
|
import asyncio
|
|
import os
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from agentkit.chat.skill_routing import CostAwareRouter
|
|
from agentkit.router.intent import IntentRouter
|
|
from agentkit.server.app import _build_llm_gateway, _build_skill_registry
|
|
from agentkit.server.config import ServerConfig
|
|
from agentkit.skills.registry import SkillRegistry
|
|
|
|
from tests.e2e.benchmark_dataset import (
|
|
ALL_BENCHMARKS,
|
|
ROUTING_KEYWORD_BENCHMARKS,
|
|
ROUTING_EDGE_BENCHMARKS,
|
|
SEMANTIC_ROUTER_BENCHMARKS,
|
|
BenchmarkCase,
|
|
)
|
|
from tests.e2e.capability_metrics import MetricsCollector
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
# Real component initialization from agentkit.yaml
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
def _find_config_path() -> str | None:
|
|
"""Find agentkit.yaml in standard search paths."""
|
|
candidates = [
|
|
os.environ.get("AGENTKIT_CONFIG", ""),
|
|
str(Path.cwd() / "agentkit.yaml"),
|
|
str(Path.home() / ".agentkit" / "agentkit.yaml"),
|
|
]
|
|
for path in candidates:
|
|
if path and Path(path).is_file():
|
|
return path
|
|
return None
|
|
|
|
|
|
# Default DashScope endpoints, used only when a provider has no base_url.
_DASHSCOPE_CODING_BASE_URL = "https://coding.dashscope.aliyuncs.com/v1"
_DASHSCOPE_COMPAT_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"


def _default_dashscope_base_url(api_key: str) -> str:
    """Pick the default DashScope endpoint for *api_key* (``sk-sp-*`` keys use the coding one)."""
    if api_key.startswith("sk-sp-"):
        return _DASHSCOPE_CODING_BASE_URL
    return _DASHSCOPE_COMPAT_BASE_URL


def _load_env_file(env_path: Path) -> None:
    """Load ``KEY=VALUE`` pairs from *env_path* into ``os.environ``.

    Uses python-dotenv when it is installed; otherwise falls back to a minimal
    parser that skips blank lines, ``#`` comments and lines without ``=``,
    strips surrounding quotes, and never overrides an existing variable.
    """
    try:
        from dotenv import load_dotenv
    except ImportError:
        with env_path.open(encoding="utf-8") as env_file:
            for raw_line in env_file:
                line = raw_line.strip()
                if not line or line.startswith("#") or "=" not in line:
                    continue
                key, _, value = line.partition("=")
                os.environ.setdefault(key.strip(), value.strip().strip("'\""))
    else:
        load_dotenv(env_path)


def _inject_dashscope_key(server_config: ServerConfig) -> None:
    """Give the first provider lacking an API key the ``DASHSCOPE_API_KEY`` from the environment."""
    dashscope_key = os.environ.get("DASHSCOPE_API_KEY", "")
    if not dashscope_key:
        return
    for pconf in server_config.llm_config.providers.values():
        if pconf.api_key:
            continue
        pconf.api_key = dashscope_key
        if not pconf.base_url:
            pconf.base_url = _default_dashscope_base_url(dashscope_key)
        # Only the first key-less provider is patched.
        return


def _run_coroutine_blocking(coro) -> None:
    """Run *coro* to completion from synchronous code.

    ``asyncio.run`` cannot be called while an event loop is running in the
    current thread (the case when this module is reached from inside a test's
    own ``asyncio.run``), so in that situation the coroutine is driven by a
    fresh loop on a worker thread and this call blocks until it finishes.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        loop_running = False
    else:
        loop_running = True

    if not loop_running:
        asyncio.run(coro)
        return

    import concurrent.futures

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        pool.submit(asyncio.run, coro).result()


def _build_semantic_router(server_config, llm_gateway, skill_registry, semantic_conf):
    """Build a SemanticRouter with its skill index, or return ``None``.

    The embedder is taken from ``llm_gateway._embedder`` when present;
    otherwise one is created from the first provider that has an API key.
    This is best-effort: any failure (missing optional modules, no embedder,
    index build error) prints a warning and yields ``None``, so a router whose
    index could not be built is never handed to the CostAwareRouter.
    """
    try:
        from agentkit.chat.semantic_router import SemanticRouter
        from agentkit.memory.embedder import OpenAIEmbedder

        # Try to get embedder from LLM gateway cache first
        embedder = getattr(llm_gateway, "_embedder", None)

        if embedder is None:
            # Find a provider with an API key to use for embedding
            for pname, pconf in server_config.llm_config.providers.items():
                if not pconf.api_key:
                    continue
                base_url = pconf.base_url or _default_dashscope_base_url(pconf.api_key)
                embedder = OpenAIEmbedder(
                    api_key=pconf.api_key,
                    base_url=base_url,
                    model="text-embedding-v3",
                )
                print(f"Created embedder from provider '{pname}' (base_url={base_url})")
                break

        if embedder is None:
            print("Warning: No embedder available for SemanticRouter")
            return None

        semantic_router = SemanticRouter(
            embedder=embedder,
            similarity_high=semantic_conf.get("similarity_high", 0.85),
            similarity_low=semantic_conf.get("similarity_low", 0.4),
        )
        _run_coroutine_blocking(semantic_router.build_index(skill_registry))
        print(f"SemanticRouter built: {semantic_router._index.size} skills indexed")
        return semantic_router
    except Exception as e:
        # Deliberately broad: the semantic layer is optional for these tests.
        print(f"Warning: SemanticRouter not available: {e}")
        return None


def _build_real_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
    """Build real components from agentkit.yaml configuration.

    Steps: locate the config, load a sibling ``.env`` file if one exists,
    parse the server config, fall back to ``DASHSCOPE_API_KEY`` when no
    provider has a key, then build the LLM gateway, skill registry, intent
    router, optional semantic router and finally the CostAwareRouter.

    Returns:
        ``(router, skill_registry, intent_router)``.

    Raises:
        Calls ``pytest.skip`` (aborting the current test as skipped) when no
        agentkit.yaml is found or no LLM provider has a valid API key.
    """
    config_path = _find_config_path()
    if not config_path:
        pytest.skip("No agentkit.yaml found — cannot build real components")

    # Load .env if present (a directory named ".env" is ignored)
    env_path = Path(config_path).parent / ".env"
    if env_path.is_file():
        _load_env_file(env_path)

    server_config = ServerConfig.from_yaml(config_path)

    # If no provider has a key yet, try to inject DASHSCOPE_API_KEY, then re-check.
    if not server_config.has_llm_provider():
        _inject_dashscope_key(server_config)
    if not server_config.has_llm_provider():
        pytest.skip("No LLM provider with valid API key — skipping real LLM tests")

    llm_gateway = _build_llm_gateway(server_config)
    skill_registry = _build_skill_registry(server_config)
    intent_router = IntentRouter(llm_gateway=llm_gateway)

    router_conf = server_config.router or {}
    # `or {}` also covers an explicit `semantic: null` in the YAML.
    semantic_conf = router_conf.get("semantic") or {}

    semantic_router = None
    if semantic_conf.get("enabled", False):
        semantic_router = _build_semantic_router(
            server_config, llm_gateway, skill_registry, semantic_conf
        )

    router = CostAwareRouter(
        llm_gateway=llm_gateway,
        model="default",
        org_context=None,
        auction_enabled=router_conf.get("auction_enabled", False),
        classifier=router_conf.get("classifier", "heuristic"),
        merged_llm_classify=router_conf.get("merged_llm_classify", True),
        semantic_router=semantic_router,
    )

    return router, skill_registry, intent_router
|
|
|
|
|
|
# Cache components at module level to avoid rebuilding for every test.
# Stays None until the first successful build; afterwards holds the
# (router, skill_registry, intent_router) tuple.
_cached_components: tuple[CostAwareRouter, SkillRegistry, IntentRouter] | None = None
|
|
|
|
|
|
def _get_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
    """Return the shared (router, skill_registry, intent_router) tuple.

    The tuple is built on first use and stored in the module-level
    ``_cached_components``; later calls return the stored tuple. If the build
    raises (including a pytest skip), nothing is stored and the next call
    tries again.
    """
    global _cached_components
    components = _cached_components
    if components is None:
        components = _build_real_components()
        _cached_components = components
    return components
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
# Helper: Run a single benchmark through the real router
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
# Benchmark execution-mode labels -> the upper-cased `execution_mode.value`
# string they are compared against. Labels not listed are simply upper-cased.
_EXECUTION_MODE_ALIASES = {
    "direct": "DIRECT_CHAT",
    "react": "SKILL_REACT",
    "rewoo": "REWOO",
    "reflexion": "REFLEXION",
    "plan_exec": "PLAN_EXEC",
    "team_collab": "TEAM_COLLAB",
    "llm_generate": "SKILL_REACT",
    "tool_call": "SKILL_REACT",
    "custom": "SKILL_REACT",
}


def _complexity_level(score: float) -> str:
    """Bucket a complexity score: below 0.3 is "low", below 0.7 "medium", else "high"."""
    if score < 0.3:
        return "low"
    if score < 0.7:
        return "medium"
    return "high"


async def _run_router_benchmark(
    benchmark: BenchmarkCase,
    collector: MetricsCollector,
    test_name: str,
    is_paraphrase: bool = False,
    input_override: str | None = None,
) -> dict:
    """Run a single benchmark through the real router and record the outcome.

    Args:
        benchmark: The case to run; its ``input`` is the query unless
            ``input_override`` is given.
        collector: Receives the timer start and the recorded observation.
        test_name: Label stored with the observation.
        is_paraphrase: Forwarded to the collector to tag paraphrase runs.
        input_override: Replacement query (e.g. a paraphrase). A falsy value
            (``None`` or ``""``) falls back to ``benchmark.input``.

    Returns:
        A dict with the keys ``skill_correct``, ``execution_mode_correct``,
        ``complexity_correct``, ``actual_skill``, ``actual_exec_mode``,
        ``actual_complexity``, ``actual_complexity_level``,
        ``actual_match_method``, ``actual_match_confidence`` and
        ``task_succeeded``. ``skill_correct`` and ``execution_mode_correct``
        are ``None`` when they cannot be judged.

    Routing exceptions are not raised: they are recorded as a failed
    observation (status 500, message truncated to 200 characters).
    """
    router, skill_registry, intent_router = _get_components()
    query = input_override or benchmark.input

    collector.start_timer(benchmark.id)

    try:
        result = await router.route(
            content=query,
            skill_registry=skill_registry,
            intent_router=intent_router,
            default_tools=[],
            default_system_prompt=None,
        )

        actual_skill = result.skill_name
        actual_exec_mode = result.execution_mode.value if result.execution_mode else None
        actual_complexity = result.complexity
        actual_match_method = result.match_method
        actual_match_confidence = result.match_confidence
        task_succeeded = True
        error_msg = None
    except Exception as e:
        # Deliberately broad: a failing route is a data point, not a test error.
        actual_skill = None
        actual_exec_mode = None
        actual_complexity = 0.0
        actual_match_method = None
        actual_match_confidence = 0.0
        task_succeeded = False
        error_msg = str(e)[:200]

    # Map complexity score to level
    actual_complexity_level = _complexity_level(actual_complexity)

    # Judge skill correctness. With no expected skill the expression below is
    # always True (a failed route has actual_skill None); with an expected
    # skill but no routed skill the verdict stays None (undetermined).
    skill_correct = None
    if benchmark.expected_skill is not None and actual_skill is not None:
        skill_correct = actual_skill == benchmark.expected_skill
    elif benchmark.expected_skill is None:
        skill_correct = actual_skill is None or task_succeeded

    execution_mode_correct = None
    if actual_exec_mode is not None and benchmark.expected_execution_mode:
        expected_normalized = _EXECUTION_MODE_ALIASES.get(
            benchmark.expected_execution_mode, benchmark.expected_execution_mode.upper()
        )
        execution_mode_correct = actual_exec_mode.upper() == expected_normalized

    # A failed route has no real complexity: the 0.0 placeholder buckets to
    # "low" and must not be credited as a correct classification.
    complexity_correct = (
        task_succeeded and actual_complexity_level == benchmark.expected_complexity
    )

    obs = collector.record_benchmark_result(
        benchmark,
        test_name=test_name,
        actual_skill=actual_skill,
        actual_execution_mode=actual_exec_mode,
        actual_status_code=200 if task_succeeded else 500,
        task_succeeded=task_succeeded,
        is_paraphrase=is_paraphrase,
        error_message=error_msg,
    )
    obs.complexity_correct = complexity_correct

    return {
        "skill_correct": skill_correct,
        "execution_mode_correct": execution_mode_correct,
        "complexity_correct": complexity_correct,
        "actual_skill": actual_skill,
        "actual_exec_mode": actual_exec_mode,
        "actual_complexity": actual_complexity,
        "actual_complexity_level": actual_complexity_level,
        "actual_match_method": actual_match_method,
        "actual_match_confidence": actual_match_confidence,
        "task_succeeded": task_succeeded,
    }
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
# Layer 0: Rule Matching Tests
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.mark.e2e_capability
class TestRouterLayer0:
    """Layer 0 (rule matching) checks against the real router."""

    @pytest.mark.parametrize(
        "benchmark",
        [
            case
            for case in ROUTING_EDGE_BENCHMARKS
            if case.subcategory in ("greeting", "identity", "explicit_prefix")
        ],
        ids=lambda case: case.id,
    )
    def test_layer0_rules(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
        """Route greeting, identity and @skill:-prefix cases and record the outcome.

        Identity cases are only recorded; the other two carry a lenient check.
        """
        outcome = asyncio.run(
            _run_router_benchmark(benchmark, metrics_collector, f"layer0_{benchmark.id}")
        )
        subcategory = benchmark.subcategory
        if subcategory == "greeting":
            # NOTE(review): cannot fail as written — a failed route reports
            # match method None, and a successful one sets task_succeeded.
            assert outcome["task_succeeded"] or outcome["actual_match_method"] in ("layer0", None)
        if subcategory == "explicit_prefix":
            # Any successful route passes, whichever skill was chosen.
            assert outcome["task_succeeded"] or outcome["actual_skill"] == benchmark.expected_skill
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
# Layer 1: Complexity Classification Tests
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.mark.e2e_capability
class TestRouterLayer1:
    """Layer 1 (complexity classification) runs against the real router."""

    @pytest.mark.parametrize(
        "benchmark",
        ROUTING_KEYWORD_BENCHMARKS,
        ids=lambda case: case.id,
    )
    def test_complexity_classification(
        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
    ):
        """Route each keyword benchmark; the result is recorded in the collector, not asserted."""
        test_name = f"layer1_{benchmark.id}"
        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, test_name))
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
# Semantic Router Tests
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.mark.e2e_capability
class TestSemanticRouter:
    """Semantic-router benchmark runs against the real router."""

    @pytest.mark.parametrize(
        "benchmark",
        SEMANTIC_ROUTER_BENCHMARKS,
        ids=lambda case: case.id,
    )
    def test_semantic_match(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
        """Route each semantic benchmark; the result is recorded in the collector, not asserted."""
        test_name = f"semantic_{benchmark.id}"
        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, test_name))
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
# Paraphrase Consistency Tests (Overfitting Detection)
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.mark.e2e_capability
class TestRouterParaphraseConsistency:
    """Route original and paraphrased inputs so their skills can be compared."""

    @pytest.mark.parametrize(
        "benchmark",
        # First 10 benchmarks that have paraphrases and an expected skill.
        [
            case
            for case in ALL_BENCHMARKS
            if case.paraphrases and case.expected_skill is not None
        ][:10],
        ids=lambda case: case.id,
    )
    def test_paraphrase_routes_same_skill(
        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
    ):
        """Run the original input, then every paraphrase, through the router.

        NOTE(review): nothing is asserted here; each run is only recorded in
        the collector (paraphrase runs are tagged ``is_paraphrase=True``).
        """
        # Original wording first.
        original_name = f"para_orig_{benchmark.id}"
        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, original_name))

        # Then each paraphrase, scored against the same benchmark case.
        for index, paraphrase in enumerate(benchmark.paraphrases):
            paraphrase_run = _run_router_benchmark(
                benchmark,
                metrics_collector,
                f"para_{benchmark.id}_{index}",
                is_paraphrase=True,
                input_override=paraphrase,
            )
            asyncio.run(paraphrase_run)
|