fischer-agentkit/tests/e2e/test_capability_router_dire...

"""E2E Agent Capability Tests — Router Direct Backtest Layer (Real LLM).

Directly tests CostAwareRouter.route() using real LLM configuration
loaded from agentkit.yaml. Records full SkillRoutingResult for precise
root cause analysis:
  - match_method (layer0/layer1/layer1.5/layer2)
  - match_confidence
  - complexity score
  - execution_trace
"""

import asyncio
import os
from pathlib import Path

import pytest

from agentkit.chat.skill_routing import CostAwareRouter
from agentkit.router.intent import IntentRouter
from agentkit.server.app import _build_llm_gateway, _build_skill_registry
from agentkit.server.config import ServerConfig
from agentkit.skills.registry import SkillRegistry

from tests.e2e.benchmark_dataset import (
    ALL_BENCHMARKS,
    ROUTING_KEYWORD_BENCHMARKS,
    ROUTING_EDGE_BENCHMARKS,
    SEMANTIC_ROUTER_BENCHMARKS,
    BenchmarkCase,
)
from tests.e2e.capability_metrics import MetricsCollector


# ═══════════════════════════════════════════════════════════════════════════
# Real component initialization from agentkit.yaml
# ═══════════════════════════════════════════════════════════════════════════


def _find_config_path() -> str | None:
    """Find agentkit.yaml in standard search paths."""
    candidates = [
        os.environ.get("AGENTKIT_CONFIG", ""),
        str(Path.cwd() / "agentkit.yaml"),
        str(Path.home() / ".agentkit" / "agentkit.yaml"),
    ]
    for path in candidates:
        if path and Path(path).is_file():
            return path
    return None


def _build_real_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
    """Build the real router stack from the discovered ``agentkit.yaml``.

    Steps, in order:
      1. Locate the config file via ``_find_config_path()``.
      2. Load a ``.env`` file sitting next to the config, using python-dotenv
         when it is importable and a minimal ``KEY=VALUE`` parser otherwise.
         Neither path overrides variables that are already set.
      3. Parse the config into a ``ServerConfig``.
      4. If no provider is usable, copy ``DASHSCOPE_API_KEY`` from the
         environment into the first provider that lacks an API key.
      5. Build the LLM gateway, skill registry, intent router and
         ``CostAwareRouter``.

    Returns:
        ``(router, skill_registry, intent_router)``.

    Raises:
        The pytest skip outcome (via ``pytest.skip``) when no config file is
        found or no LLM provider is usable.
    """
    config_path = _find_config_path()
    if not config_path:
        pytest.skip("No agentkit.yaml found — cannot build real components")

    # Load .env if present
    env_path = Path(config_path).parent / ".env"
    if env_path.exists():
        try:
            from dotenv import load_dotenv
        except ImportError:
            # python-dotenv not installed: parse KEY=VALUE lines by hand.
            # Read as UTF-8 explicitly so the fallback decodes the file the
            # same way python-dotenv does instead of using the locale default.
            with open(env_path, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if line and not line.startswith("#") and "=" in line:
                        key, _, value = line.partition("=")
                        # setdefault: existing environment values win.
                        os.environ.setdefault(key.strip(), value.strip().strip("'\""))
        else:
            load_dotenv(env_path)

    server_config = ServerConfig.from_yaml(config_path)

    # Check if any LLM provider has a valid API key
    if not server_config.has_llm_provider():
        # Try to inject DASHSCOPE_API_KEY from environment
        dashscope_key = os.environ.get("DASHSCOPE_API_KEY", "")
        if dashscope_key:
            # Only the first provider without a key is patched.
            for pconf in server_config.llm_config.providers.values():
                if not pconf.api_key:
                    pconf.api_key = dashscope_key
                    # Set base_url for dashscope if missing
                    if not pconf.base_url:
                        pconf.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
                    break

    # Re-check: the injection above may have made a provider usable.
    if not server_config.has_llm_provider():
        pytest.skip("No LLM provider with valid API key — skipping real LLM tests")

    # Build real LLM gateway
    llm_gateway = _build_llm_gateway(server_config)

    # Build real skill registry from the server configuration
    skill_registry = _build_skill_registry(server_config)

    # Build real intent router
    intent_router = IntentRouter(llm_gateway=llm_gateway)

    # Build real CostAwareRouter; a missing ``router`` section falls back to
    # the defaults spelled out in the ``.get`` calls below.
    router_conf = server_config.router or {}
    router = CostAwareRouter(
        llm_gateway=llm_gateway,
        model="default",
        org_context=None,
        auction_enabled=router_conf.get("auction_enabled", False),
        classifier=router_conf.get("classifier", "heuristic"),
        merged_llm_classify=router_conf.get("merged_llm_classify", True),
    )

    return router, skill_registry, intent_router


# Module-level cache: once the components are built successfully they are
# reused by every test instead of being rebuilt per test.
_cached_components: tuple[CostAwareRouter, SkillRegistry, IntentRouter] | None = None


def _get_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
    """Return the shared ``(router, skill_registry, intent_router)`` triple.

    The triple is built on first use and stored in ``_cached_components``.
    If the build does not complete (for example it skips), nothing is cached
    and the next call attempts the build again.
    """
    global _cached_components
    if _cached_components is not None:
        return _cached_components
    _cached_components = _build_real_components()
    return _cached_components


# ═══════════════════════════════════════════════════════════════════════════
# Helper: Run a single benchmark through the real router
# ═══════════════════════════════════════════════════════════════════════════


async def _run_router_benchmark(
    benchmark: BenchmarkCase,
    collector: MetricsCollector,
    test_name: str,
    is_paraphrase: bool = False,
    input_override: str | None = None,
) -> dict:
    """Route one benchmark query through the real router and record the outcome.

    Args:
        benchmark: Case supplying the query and the expected skill, execution
            mode and complexity level.
        collector: Metrics sink; its timer is started for ``benchmark.id`` and
            the outcome is stored via ``record_benchmark_result``.
        test_name: Label forwarded to ``record_benchmark_result``.
        is_paraphrase: Forwarded to ``record_benchmark_result``.
        input_override: Query text used instead of ``benchmark.input``; a
            falsy value (``None`` or ``""``) falls back to ``benchmark.input``.

    Returns:
        A dict with the keys ``skill_correct``, ``execution_mode_correct``,
        ``complexity_correct``, ``actual_skill``, ``actual_exec_mode``,
        ``actual_complexity``, ``actual_complexity_level``,
        ``actual_match_method``, ``actual_match_confidence`` and
        ``task_succeeded``. ``skill_correct`` and ``execution_mode_correct``
        are ``None`` when they could not be judged.

    Any exception raised while routing or reading the result is caught and
    reported as ``task_succeeded=False`` rather than propagated.
    """
    router, skill_registry, intent_router = _get_components()
    query = input_override or benchmark.input

    collector.start_timer(benchmark.id)

    try:
        result = await router.route(
            content=query,
            skill_registry=skill_registry,
            intent_router=intent_router,
            default_tools=[],
            default_system_prompt=None,
        )

        actual_skill = result.skill_name
        actual_exec_mode = result.execution_mode.value if result.execution_mode else None
        actual_complexity = result.complexity
        actual_match_method = result.match_method
        actual_match_confidence = result.match_confidence
        task_succeeded = True
        error_msg = None
    except Exception as e:
        # Deliberate best-effort: a routing failure is recorded as a failed
        # observation (with placeholder values) instead of aborting the run.
        actual_skill = None
        actual_exec_mode = None
        actual_complexity = 0.0
        actual_match_method = None
        actual_match_confidence = 0.0
        task_succeeded = False
        error_msg = str(e)[:200]

    # Map complexity score to level
    if actual_complexity < 0.3:
        actual_complexity_level = "low"
    elif actual_complexity < 0.7:
        actual_complexity_level = "medium"
    else:
        actual_complexity_level = "high"

    # Judge skill correctness. Stays None when a skill was expected but none
    # was returned.
    skill_correct = None
    if benchmark.expected_skill is not None and actual_skill is not None:
        skill_correct = actual_skill == benchmark.expected_skill
    elif benchmark.expected_skill is None:
        # NOTE(review): this is always True here, because a failed route
        # leaves actual_skill as None — confirm that is the intended verdict.
        skill_correct = actual_skill is None or task_succeeded

    execution_mode_correct = None
    if actual_exec_mode is not None and benchmark.expected_execution_mode:
        # Benchmark labels -> names compared against the upper-cased router
        # execution mode; unknown labels are compared upper-cased as-is.
        mode_map = {
            "direct": "DIRECT_CHAT",
            "react": "SKILL_REACT",
            "rewoo": "REWOO",
            "reflexion": "REFLEXION",
            "plan_exec": "PLAN_EXEC",
            "team_collab": "TEAM_COLLAB",
            "llm_generate": "SKILL_REACT",
            "tool_call": "SKILL_REACT",
            "custom": "SKILL_REACT",
        }
        expected_normalized = mode_map.get(
            benchmark.expected_execution_mode, benchmark.expected_execution_mode.upper()
        )
        execution_mode_correct = actual_exec_mode.upper() == expected_normalized

    # A failed route only carries the 0.0 placeholder score, which maps to
    # "low". Never credit that placeholder as a correct classification.
    complexity_correct = (
        task_succeeded and actual_complexity_level == benchmark.expected_complexity
    )

    obs = collector.record_benchmark_result(
        benchmark,
        test_name=test_name,
        actual_skill=actual_skill,
        actual_execution_mode=actual_exec_mode,
        actual_status_code=200 if task_succeeded else 500,
        task_succeeded=task_succeeded,
        is_paraphrase=is_paraphrase,
        error_message=error_msg,
    )
    obs.complexity_correct = complexity_correct

    return {
        "skill_correct": skill_correct,
        "execution_mode_correct": execution_mode_correct,
        "complexity_correct": complexity_correct,
        "actual_skill": actual_skill,
        "actual_exec_mode": actual_exec_mode,
        "actual_complexity": actual_complexity,
        "actual_complexity_level": actual_complexity_level,
        "actual_match_method": actual_match_method,
        "actual_match_confidence": actual_match_confidence,
        "task_succeeded": task_succeeded,
    }


# ═══════════════════════════════════════════════════════════════════════════
# Layer 0: Rule Matching Tests
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestRouterLayer0:
    """Rule-based (Layer 0) routing checks executed against the real router."""

    @pytest.mark.parametrize(
        "benchmark",
        [
            case
            for case in ROUTING_EDGE_BENCHMARKS
            if case.subcategory in ("greeting", "identity", "explicit_prefix")
        ],
        ids=[
            case.id
            for case in ROUTING_EDGE_BENCHMARKS
            if case.subcategory in ("greeting", "identity", "explicit_prefix")
        ],
    )
    def test_layer0_rules(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
        """Route greeting, identity and explicit-prefix cases through the router.

        Every case is recorded in the metrics collector. Only two
        subcategories carry an assertion, and both are soft: a successful
        route satisfies them on its own. Identity cases are recorded only.
        """
        label = f"layer0_{benchmark.id}"
        outcome = asyncio.run(_run_router_benchmark(benchmark, metrics_collector, label))
        routed_ok = outcome["task_succeeded"]

        if benchmark.subcategory == "greeting":
            assert routed_ok or outcome["actual_match_method"] in ("layer0", None)
        if benchmark.subcategory == "explicit_prefix":
            assert routed_ok or outcome["actual_skill"] == benchmark.expected_skill


# ═══════════════════════════════════════════════════════════════════════════
# Layer 1: Complexity Classification Tests
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestRouterLayer1:
    """Layer 1 complexity-classification runs against the real router."""

    @pytest.mark.parametrize(
        "benchmark",
        ROUTING_KEYWORD_BENCHMARKS,
        ids=[case.id for case in ROUTING_KEYWORD_BENCHMARKS],
    )
    def test_complexity_classification(
        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
    ):
        """Route one keyword benchmark and record its outcome in the collector.

        The test itself makes no assertion; the recorded metrics carry the
        complexity verdict.
        """
        label = f"layer1_{benchmark.id}"
        pending = _run_router_benchmark(benchmark, metrics_collector, label)
        asyncio.run(pending)


# ═══════════════════════════════════════════════════════════════════════════
# Semantic Router Tests
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestSemanticRouter:
    """Semantic-router benchmark runs against the real router."""

    @pytest.mark.parametrize(
        "benchmark",
        SEMANTIC_ROUTER_BENCHMARKS,
        ids=[case.id for case in SEMANTIC_ROUTER_BENCHMARKS],
    )
    def test_semantic_match(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
        """Route one semantic benchmark and record its outcome in the collector.

        The test itself makes no assertion; results are judged from the
        recorded metrics.
        """
        label = f"semantic_{benchmark.id}"
        pending = _run_router_benchmark(benchmark, metrics_collector, label)
        asyncio.run(pending)


# ═══════════════════════════════════════════════════════════════════════════
# Paraphrase Consistency Tests (Overfitting Detection)
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestRouterParaphraseConsistency:
    """Run originals and their paraphrases through the router for comparison."""

    @pytest.mark.parametrize(
        "benchmark",
        [
            case
            for case in ALL_BENCHMARKS
            if case.paraphrases and case.expected_skill is not None
        ][:10],
        ids=[
            case.id
            for case in ALL_BENCHMARKS
            if case.paraphrases and case.expected_skill is not None
        ][:10],
    )
    def test_paraphrase_routes_same_skill(
        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
    ):
        """Record routing results for the original input and each paraphrase.

        Covers the first ten benchmarks that have paraphrases and an expected
        skill. The test makes no assertion itself; paraphrase runs are
        flagged with ``is_paraphrase=True`` in the collector.
        """
        # Baseline: the benchmark's own input.
        baseline_label = f"para_orig_{benchmark.id}"
        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, baseline_label))

        # Each paraphrase replaces the query text but keeps the expectations.
        for index, variant in enumerate(benchmark.paraphrases):
            variant_run = _run_router_benchmark(
                benchmark,
                metrics_collector,
                f"para_{benchmark.id}_{index}",
                is_paraphrase=True,
                input_override=variant,
            )
            asyncio.run(variant_run)