# Source: fischer-agentkit/tests/e2e/test_capability_routing.py (274 lines, 11 KiB, Python)

"""E2E agent capability tests: intent-routing intelligence with metrics collection.

Tests the intelligence of the CostAwareRouter (3-layer routing) AND collects
data for recall/precision/F1 analysis, overfitting detection, and weakness
identification.

Each benchmark is exercised as follows:

1. The benchmark case is run with its original input.
2. All paraphrases of the same input are run (overfitting detection).
3. Every run is recorded as an observation in the MetricsCollector.
4. Minimum quality thresholds are asserted. In this file that means HTTP 200
   for original inputs and a consistent skill across repeated runs;
   paraphrase runs are recorded without assertions.
"""
import pytest
import httpx
from tests.e2e.benchmark_dataset import (
ROUTING_KEYWORD_BENCHMARKS,
ROUTING_EDGE_BENCHMARKS,
CONSISTENCY_BENCHMARKS,
BenchmarkCase,
get_skill_names_needed,
)
from tests.e2e.capability_metrics import MetricsCollector
from tests.e2e.conftest import register_skill_via_api
# ═══════════════════════════════════════════════════════════════════════════
# Pre-registration of all skills needed by benchmarks
# ═══════════════════════════════════════════════════════════════════════════
@pytest.fixture(autouse=True, scope="module")
def register_benchmark_skills(api_client: httpx.Client):
    """Register every skill the routing benchmarks need.

    The fixture is autouse with module scope, so tests in this module do not
    have to request it. Each skill is registered through the API with its own
    name as its single keyword.
    """
    needed_skills = get_skill_names_needed()
    for name in needed_skills:
        register_skill_via_api(api_client, name, keywords=[name])
# ═══════════════════════════════════════════════════════════════════════════
# Helper: run a single benchmark case and record metrics
# ═══════════════════════════════════════════════════════════════════════════
def _run_benchmark_and_record(
    benchmark: BenchmarkCase,
    api_client: httpx.Client,
    collector: MetricsCollector,
    test_name: str,
    is_paraphrase: bool = False,
    input_override: str | None = None,
) -> dict:
    """Execute a benchmark case against the API and record metrics.

    Posts the query to ``/api/v1/tasks``, extracts the routing outcome from
    the response, and forwards the observation to ``collector``.

    Args:
        benchmark: The case to run; its ``id``, ``input`` and
            ``expected_skill`` attributes are read here.
        api_client: HTTP client used to submit the task.
        collector: Receives ``start_timer(benchmark.id)`` before the request
            and ``record_benchmark_result(...)`` after it.
        test_name: Label stored with the recorded observation.
        is_paraphrase: Whether this run uses a paraphrase rather than the
            original benchmark input; passed through to the collector.
        input_override: Query text to send instead of ``benchmark.input``.
            ``None`` means "use the benchmark input"; any string, including
            the empty string, is sent as given.

    Returns:
        A dict with the keys ``status_code``, ``actual_skill``,
        ``actual_exec_mode`` and ``task_succeeded``.
    """
    # Compare against None, not truthiness: an explicit "" override must not
    # silently fall back to the original benchmark input.
    query = benchmark.input if input_override is None else input_override
    collector.start_timer(benchmark.id)

    payload: dict = {"input_data": {"query": query}}
    if benchmark.expected_skill is not None:
        # NOTE(review): sending the expected skill with the request may
        # pre-select it server-side instead of exercising routing; confirm
        # against the /api/v1/tasks contract.
        payload["skill_name"] = benchmark.expected_skill

    resp = api_client.post("/api/v1/tasks", json=payload)

    actual_skill = None
    actual_exec_mode = None
    actual_keys: list = []
    error_msg = None
    # Only an exact 200 counts as success; other 2xx/3xx codes are recorded
    # as failures without an error message.
    task_succeeded = resp.status_code == 200

    if task_succeeded:
        data = resp.json()
        actual_skill = data.get("skill_name")
        actual_exec_mode = data.get("execution_mode")
        actual_keys = list(data.keys())
    elif resp.status_code >= 400:
        # Prefer the structured "detail" field; fall back to a truncated body
        # when the response is not JSON or is not a JSON object.
        fallback_text = resp.text[:200]
        try:
            body = resp.json()
        except ValueError:  # JSON decode errors are ValueError subclasses
            body = None
        if isinstance(body, dict):
            error_msg = body.get("detail", fallback_text)
        else:
            error_msg = fallback_text

    collector.record_benchmark_result(
        benchmark,
        test_name=test_name,
        actual_skill=actual_skill,
        actual_execution_mode=actual_exec_mode,
        actual_status_code=resp.status_code,
        actual_response_keys=actual_keys,
        task_succeeded=task_succeeded,
        is_paraphrase=is_paraphrase,
        error_message=error_msg,
    )

    return {
        "status_code": resp.status_code,
        "actual_skill": actual_skill,
        "actual_exec_mode": actual_exec_mode,
        "task_succeeded": task_succeeded,
    }
# ═══════════════════════════════════════════════════════════════════════════
# Parameterized Routing Benchmark Tests
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestRoutingBenchmarks:
    """Drive the keyword and edge-case routing benchmarks, recording metrics for each."""

    @pytest.mark.parametrize(
        "benchmark",
        ROUTING_KEYWORD_BENCHMARKS + ROUTING_EDGE_BENCHMARKS,
        ids=[case.id for case in ROUTING_KEYWORD_BENCHMARKS + ROUTING_EDGE_BENCHMARKS],
    )
    def test_routing_benchmark(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Submit the benchmark's original input, record it, and require HTTP 200."""
        outcome = _run_benchmark_and_record(
            benchmark,
            api_client,
            metrics_collector,
            test_name=f"routing_benchmark_{benchmark.id}",
        )

        assert outcome["status_code"] == 200, f"Benchmark {benchmark.id} failed: {outcome}"

    @pytest.mark.parametrize(
        "benchmark",
        [
            case
            for case in ROUTING_KEYWORD_BENCHMARKS + ROUTING_EDGE_BENCHMARKS
            if case.paraphrases
        ],
        ids=[
            case.id
            for case in ROUTING_KEYWORD_BENCHMARKS + ROUTING_EDGE_BENCHMARKS
            if case.paraphrases
        ],
    )
    def test_routing_paraphrase(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Replay every paraphrase of the benchmark for overfitting detection.

        Only benchmarks with at least one paraphrase are parametrized. The
        runs are recorded to the collector; nothing is asserted here.
        """
        for index, variant in enumerate(benchmark.paraphrases):
            _run_benchmark_and_record(
                benchmark,
                api_client,
                metrics_collector,
                test_name=f"routing_paraphrase_{benchmark.id}_{index}",
                is_paraphrase=True,
                input_override=variant,
            )
# ═══════════════════════════════════════════════════════════════════════════
# Routing Consistency (same input, multiple runs)
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestRoutingConsistency:
    """Repeated submissions of one input must agree on the routing decision."""

    def test_same_query_same_skill(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run each consistency benchmark three times and compare the chosen skills.

        Runs that report no skill (``None``) are left out of the comparison,
        so the check only fires when at least two runs reported a skill.
        """
        for benchmark in CONSISTENCY_BENCHMARKS:
            skills_seen: list[str | None] = [
                _run_benchmark_and_record(
                    benchmark,
                    api_client,
                    metrics_collector,
                    test_name=f"consistency_{benchmark.id}_run{attempt}",
                )["actual_skill"]
                for attempt in range(3)
            ]

            reported = [skill for skill in skills_seen if skill is not None]
            if len(reported) < 2:
                # Fewer than two reported skills: nothing to compare.
                continue

            # Every reported skill must be the same one.
            assert len(set(reported)) == 1, (
                f"Inconsistent routing for {benchmark.id}: {skills_seen}"
            )
# ═══════════════════════════════════════════════════════════════════════════
# Routing Disambiguation (specific edge cases)
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestRoutingDisambiguation:
    """Edge cases where several skills, or none, could plausibly handle the input."""

    def test_overlapping_keywords_routes_to_best_match(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Two skills share keywords; the Python-specific query is submitted and recorded.

        Only the HTTP status of the original input is asserted. The routed
        skill and the paraphrase runs are recorded to the collector without
        assertions.
        """
        # Both coders share "code" and "programming"; only the language differs.
        overlapping_skills = {
            "python_coder": ["python", "code", "programming"],
            "javascript_coder": ["javascript", "code", "programming"],
        }
        for skill, skill_keywords in overlapping_skills.items():
            register_skill_via_api(api_client, skill, keywords=skill_keywords)

        case = BenchmarkCase(
            id="disambig-python-001",
            input="Write a Python function to sort a list",
            expected_skill="python_coder",
            expected_complexity="medium",
            category="routing",
            subcategory="disambiguation",
            paraphrases=["I need a Python sorting algorithm", "用Python写个排序函数"],
        )

        outcome = _run_benchmark_and_record(
            case,
            api_client,
            metrics_collector,
            test_name="disambig_python",
        )
        assert outcome["status_code"] == 200

        # Paraphrase runs feed overfitting detection.
        for index, variant in enumerate(case.paraphrases):
            _run_benchmark_and_record(
                case,
                api_client,
                metrics_collector,
                test_name=f"disambig_python_para_{index}",
                is_paraphrase=True,
                input_override=variant,
            )

    def test_no_matching_skill_falls_back_gracefully(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """A query with no expected skill must still be answered with HTTP 200.

        The benchmark declares ``expected_skill=None``. Paraphrase runs are
        recorded to the collector without assertions.
        """
        case = BenchmarkCase(
            id="fallback-nomatch-001",
            input="Tell me about quantum physics",
            expected_skill=None,
            expected_complexity="low",
            category="routing",
            subcategory="fallback",
            paraphrases=["Explain quantum mechanics", "量子物理是什么"],
        )

        outcome = _run_benchmark_and_record(
            case,
            api_client,
            metrics_collector,
            test_name="fallback_nomatch",
        )
        assert outcome["status_code"] == 200

        for index, variant in enumerate(case.paraphrases):
            _run_benchmark_and_record(
                case,
                api_client,
                metrics_collector,
                test_name=f"fallback_nomatch_para_{index}",
                is_paraphrase=True,
                input_override=variant,
            )