274 lines
11 KiB
Python
274 lines
11 KiB
Python
"""E2E Agent Capability Tests — Intent Routing Intelligence with Metrics Collection.
|
|
|
|
Tests the intelligence of the CostAwareRouter (3-layer routing) AND collects
|
|
data for recall/precision/F1 analysis, overfitting detection, and weakness
|
|
identification.
|
|
|
|
Each test:
|
|
1. Runs the benchmark case (original input)
|
|
2. Runs all paraphrases of the same input (overfitting detection)
|
|
3. Records observations to MetricsCollector
|
|
4. Asserts minimum quality thresholds
|
|
"""
|
|
|
|
import pytest
|
|
import httpx
|
|
|
|
from tests.e2e.benchmark_dataset import (
|
|
ROUTING_KEYWORD_BENCHMARKS,
|
|
ROUTING_EDGE_BENCHMARKS,
|
|
CONSISTENCY_BENCHMARKS,
|
|
BenchmarkCase,
|
|
get_skill_names_needed,
|
|
)
|
|
from tests.e2e.capability_metrics import MetricsCollector
|
|
from tests.e2e.conftest import register_skill_via_api
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# Pre-registration of all skills needed by benchmarks
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.fixture(autouse=True, scope="module")
def register_benchmark_skills(api_client: httpx.Client):
    """Register every skill the routing benchmarks refer to.

    Declared ``autouse`` with module scope, so the tests in this module
    do not have to request it. Each skill is registered through the API
    with its own name as the only keyword.
    """
    needed_skills = get_skill_names_needed()
    for name in needed_skills:
        register_skill_via_api(api_client, name, keywords=[name])
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# Helper: run a single benchmark case and record metrics
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
def _run_benchmark_and_record(
    benchmark: BenchmarkCase,
    api_client: httpx.Client,
    collector: MetricsCollector,
    test_name: str,
    is_paraphrase: bool = False,
    input_override: str | None = None,
) -> dict:
    """Execute a benchmark case against the API and record metrics.

    Posts the query to ``/api/v1/tasks``, pulls the routing fields out of
    the response, passes the observation to ``collector`` and returns a
    small summary for the calling test to assert on.

    Args:
        benchmark: Case providing ``id``, ``input`` and ``expected_skill``.
        api_client: HTTP client used to POST the task.
        collector: Receives ``start_timer`` and ``record_benchmark_result``.
        test_name: Label stored alongside the recorded observation.
        is_paraphrase: Forwarded unchanged to the collector.
        input_override: Query text to send instead of ``benchmark.input``.
            ``None`` means "use the benchmark's own input".

    Returns:
        A dict with the keys ``status_code``, ``actual_skill``,
        ``actual_exec_mode`` and ``task_succeeded``.
    """
    # Compare against None rather than truthiness: an explicit override
    # (even an empty string) must be sent as given, otherwise the original
    # input would be re-run and recorded under a paraphrase label.
    query = benchmark.input if input_override is None else input_override
    collector.start_timer(benchmark.id)

    payload: dict = {"input_data": {"query": query}}
    # NOTE(review): the expected skill is sent with the request when the
    # benchmark defines one -- confirm the endpoint still exercises routing
    # in that case rather than simply honouring the supplied name.
    if benchmark.expected_skill is not None:
        payload["skill_name"] = benchmark.expected_skill

    resp = api_client.post("/api/v1/tasks", json=payload)

    actual_skill = None
    actual_exec_mode = None
    actual_keys = []
    # Only an exact HTTP 200 counts as success.
    task_succeeded = resp.status_code == 200
    error_msg = None

    if task_succeeded:
        data = resp.json()
        actual_skill = data.get("skill_name")
        actual_exec_mode = data.get("execution_mode")
        actual_keys = list(data.keys())
    elif resp.status_code >= 400:
        try:
            error_msg = resp.json().get("detail", resp.text[:200])
        except (ValueError, AttributeError):
            # ValueError: the body is not valid JSON.
            # AttributeError: the JSON body is not an object (no ``.get``).
            # Either way fall back to a truncated copy of the raw body.
            error_msg = resp.text[:200]

    collector.record_benchmark_result(
        benchmark,
        test_name=test_name,
        actual_skill=actual_skill,
        actual_execution_mode=actual_exec_mode,
        actual_status_code=resp.status_code,
        actual_response_keys=actual_keys,
        task_succeeded=task_succeeded,
        is_paraphrase=is_paraphrase,
        error_message=error_msg,
    )

    return {
        "status_code": resp.status_code,
        "actual_skill": actual_skill,
        "actual_exec_mode": actual_exec_mode,
        "task_succeeded": task_succeeded,
    }
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# Parameterized Routing Benchmark Tests
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.mark.e2e_capability
class TestRoutingBenchmarks:
    """Parameterized routing benchmarks; every run is recorded as a metric."""

    # Keyword and edge benchmarks combined once for the decorators below.
    _CASES = ROUTING_KEYWORD_BENCHMARKS + ROUTING_EDGE_BENCHMARKS
    # Subset of the cases above that define at least one paraphrase.
    _CASES_WITH_PARAPHRASES = [case for case in _CASES if case.paraphrases]

    @pytest.mark.parametrize(
        "benchmark",
        _CASES,
        ids=[case.id for case in _CASES],
    )
    def test_routing_benchmark(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Submit the benchmark's original input and require an HTTP 200."""
        result = _run_benchmark_and_record(
            benchmark,
            api_client,
            metrics_collector,
            test_name=f"routing_benchmark_{benchmark.id}",
        )
        assert result["status_code"] == 200, f"Benchmark {benchmark.id} failed: {result}"

    @pytest.mark.parametrize(
        "benchmark",
        _CASES_WITH_PARAPHRASES,
        ids=[case.id for case in _CASES_WITH_PARAPHRASES],
    )
    def test_routing_paraphrase(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Submit each paraphrase of the benchmark and record the outcome.

        Nothing is asserted here; the observations are only handed to the
        metrics collector, flagged as paraphrases.
        """
        for i, variant in enumerate(benchmark.paraphrases):
            _run_benchmark_and_record(
                benchmark,
                api_client,
                metrics_collector,
                test_name=f"routing_paraphrase_{benchmark.id}_{i}",
                is_paraphrase=True,
                input_override=variant,
            )
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# Routing Consistency (same input, multiple runs)
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.mark.e2e_capability
class TestRoutingConsistency:
    """Repeated submissions of one query must agree on the routed skill."""

    def test_same_query_same_skill(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Send every consistency benchmark three times and compare skills."""
        repeat_count = 3
        for benchmark in CONSISTENCY_BENCHMARKS:
            skills_seen: list[str | None] = [
                _run_benchmark_and_record(
                    benchmark,
                    api_client,
                    metrics_collector,
                    test_name=f"consistency_{benchmark.id}_run{run_idx}",
                )["actual_skill"]
                for run_idx in range(repeat_count)
            ]

            # Runs that reported no skill are left out of the comparison;
            # with fewer than two reported skills there is nothing to compare.
            reported = [skill for skill in skills_seen if skill is not None]
            if len(reported) < 2:
                continue
            assert len(set(reported)) == 1, (
                f"Inconsistent routing for {benchmark.id}: {skills_seen}"
            )
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# Routing Disambiguation (specific edge cases)
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.mark.e2e_capability
class TestRoutingDisambiguation:
    """Edge cases where several skills, or none at all, could match."""

    def test_overlapping_keywords_routes_to_best_match(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Register two skills with shared keywords, then send a Python query.

        The original input must come back with HTTP 200; the paraphrases
        are recorded without any assertion.
        """
        # Two coder skills that share the "code" and "programming" keywords.
        overlapping_skills = (
            ("python_coder", ["python", "code", "programming"]),
            ("javascript_coder", ["javascript", "code", "programming"]),
        )
        for skill, skill_keywords in overlapping_skills:
            register_skill_via_api(api_client, skill, keywords=skill_keywords)

        case = BenchmarkCase(
            id="disambig-python-001",
            input="Write a Python function to sort a list",
            expected_skill="python_coder",
            expected_complexity="medium",
            category="routing",
            subcategory="disambiguation",
            paraphrases=["I need a Python sorting algorithm", "用Python写个排序函数"],
        )

        outcome = _run_benchmark_and_record(
            case,
            api_client,
            metrics_collector,
            test_name="disambig_python",
        )
        assert outcome["status_code"] == 200

        # Paraphrases are recorded for the collector's overfitting analysis.
        for i, variant in enumerate(case.paraphrases):
            _run_benchmark_and_record(
                case,
                api_client,
                metrics_collector,
                test_name=f"disambig_python_para_{i}",
                is_paraphrase=True,
                input_override=variant,
            )

    def test_no_matching_skill_falls_back_gracefully(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Send a query with no expected skill and require an HTTP 200.

        The paraphrases of the same query are recorded without assertion.
        """
        case = BenchmarkCase(
            id="fallback-nomatch-001",
            input="Tell me about quantum physics",
            expected_skill=None,
            expected_complexity="low",
            category="routing",
            subcategory="fallback",
            paraphrases=["Explain quantum mechanics", "量子物理是什么"],
        )

        outcome = _run_benchmark_and_record(
            case,
            api_client,
            metrics_collector,
            test_name="fallback_nomatch",
        )
        assert outcome["status_code"] == 200

        for i, variant in enumerate(case.paraphrases):
            _run_benchmark_and_record(
                case,
                api_client,
                metrics_collector,
                test_name=f"fallback_nomatch_para_{i}",
                is_paraphrase=True,
                input_override=variant,
            )
|