253 lines
9.9 KiB
Python
253 lines
9.9 KiB
Python
"""E2E Agent Capability Tests — Expert Team Collaboration with Metrics.

Tests the intelligence of expert team collaboration AND collects data for:
- Team formation accuracy
- Fallback effectiveness
- Expert coordination quality
- Overfitting detection via paraphrased inputs
"""
|
|
|
|
import pytest
|
|
import httpx
|
|
|
|
from tests.e2e.benchmark_dataset import TEAM_BENCHMARKS, BenchmarkCase
|
|
from tests.e2e.capability_metrics import MetricsCollector
|
|
from tests.e2e.conftest import register_skill_via_api
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
# Helper: run team benchmark and record metrics
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
def _run_team_benchmark(
    benchmark: BenchmarkCase,
    api_client: httpx.Client,
    collector: MetricsCollector,
    test_name: str,
    is_paraphrase: bool = False,
    input_override: str | None = None,
) -> dict:
    """Submit one team benchmark task and record the outcome.

    Posts ``{"input_data": {"query": ...}}`` to ``/api/v1/tasks`` (adding
    ``skill_name`` when the benchmark declares an expected skill), pulls the
    routing fields out of the response, and forwards everything to
    ``collector.record_benchmark_result``.

    Args:
        benchmark: Case supplying the id, the default input and the
            expected skill.
        api_client: HTTP client used to reach the task API.
        collector: Metrics sink; its timer for ``benchmark.id`` is started
            before the request is sent.
        test_name: Label under which the result is recorded.
        is_paraphrase: Whether this run uses a paraphrased input.
        input_override: Query to send instead of ``benchmark.input``. Only
            ``None`` selects the benchmark's own input.

    Returns:
        A dict with ``status_code``, ``actual_skill``, ``actual_exec_mode``
        and ``task_succeeded``.
    """
    # Compare against None explicitly: with a plain ``or`` an empty override
    # would silently be replaced by the original input while still being
    # recorded as a paraphrase run.
    query = benchmark.input if input_override is None else input_override
    collector.start_timer(benchmark.id)

    payload: dict = {"input_data": {"query": query}}
    if benchmark.expected_skill:
        payload["skill_name"] = benchmark.expected_skill

    resp = api_client.post("/api/v1/tasks", json=payload)

    actual_skill = None
    actual_exec_mode = None
    actual_keys: list[str] = []
    error_msg = None
    # Only an exact 200 counts as success; other statuses below 400 are
    # recorded with neither routing fields nor an error message.
    task_succeeded = resp.status_code == 200

    if task_succeeded:
        data = resp.json()
        actual_skill = data.get("skill_name")
        actual_exec_mode = data.get("execution_mode")
        actual_keys = list(data.keys())
    elif resp.status_code >= 400:
        # Fall back to a truncated raw body whenever the error payload is not
        # a JSON object (invalid JSON, or a JSON list/scalar).
        raw_excerpt = resp.text[:200]
        try:
            error_body = resp.json()
        except ValueError:
            error_body = None
        if isinstance(error_body, dict):
            error_msg = error_body.get("detail", raw_excerpt)
        else:
            error_msg = raw_excerpt

    collector.record_benchmark_result(
        benchmark,
        test_name=test_name,
        actual_skill=actual_skill,
        actual_execution_mode=actual_exec_mode,
        actual_status_code=resp.status_code,
        actual_response_keys=actual_keys,
        task_succeeded=task_succeeded,
        is_paraphrase=is_paraphrase,
        error_message=error_msg,
    )

    return {
        "status_code": resp.status_code,
        "actual_skill": actual_skill,
        "actual_exec_mode": actual_exec_mode,
        "task_succeeded": task_succeeded,
    }
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
# Parameterized Team Benchmark Tests
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.mark.e2e_capability
class TestTeamBenchmarks:
    """Run every case in ``TEAM_BENCHMARKS`` and record metrics for each."""

    @staticmethod
    def _register_expected_skill(
        api_client: httpx.Client,
        benchmark: BenchmarkCase,
    ) -> None:
        """Register the benchmark's expected skill, if it declares one.

        The skill name doubles as its only keyword.
        """
        skill = benchmark.expected_skill
        if skill:
            register_skill_via_api(api_client, skill, keywords=[skill])

    @pytest.mark.parametrize(
        "benchmark",
        TEAM_BENCHMARKS,
        ids=lambda case: case.id,
    )
    def test_team_benchmark(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run the benchmark's original input and require an HTTP 200."""
        self._register_expected_skill(api_client, benchmark)

        result = _run_team_benchmark(
            benchmark,
            api_client,
            metrics_collector,
            test_name=f"team_benchmark_{benchmark.id}",
        )
        assert result["status_code"] == 200, f"Team benchmark {benchmark.id} failed"

    @pytest.mark.parametrize(
        "benchmark",
        [b for b in TEAM_BENCHMARKS if b.paraphrases],
        ids=lambda case: case.id,
    )
    def test_team_paraphrase(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run each paraphrase of the benchmark for overfitting detection.

        No status assertion is made: the outcomes are only recorded through
        the metrics collector.
        """
        # Register the skill here as well, so the paraphrase requests (which
        # carry ``skill_name``) do not rely on test_team_benchmark having run
        # first against the same server.
        self._register_expected_skill(api_client, benchmark)

        for i, paraphrase in enumerate(benchmark.paraphrases):
            _run_team_benchmark(
                benchmark,
                api_client,
                metrics_collector,
                test_name=f"team_paraphrase_{benchmark.id}_{i}",
                is_paraphrase=True,
                input_override=paraphrase,
            )
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
# Team Formation Intelligence
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.mark.e2e_capability
class TestTeamFormation:
    """Team formation checks driven through the task API."""

    def test_explicit_team_prefix(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Submit an analyse-and-write request pinned to ``team_analyst``.

        Registers the ``team_analyst`` and ``team_writer`` skills, requires
        HTTP 200 for the original input, then records each paraphrase
        without asserting on it.

        NOTE(review): the test name refers to an "@team" prefix, but neither
        the input nor the paraphrases sent here contain one — confirm
        whether the prefix is supposed to be part of the query.
        """
        skills_to_register = (
            ("team_analyst", ["team_analyst", "analyze"]),
            ("team_writer", ["team_writer", "write"]),
        )
        for skill_name, skill_keywords in skills_to_register:
            register_skill_via_api(api_client, skill_name, keywords=skill_keywords)

        case = BenchmarkCase(
            id="team-explicit-001",
            input="Analyze the data and write a report",
            expected_skill="team_analyst",
            expected_execution_mode="react",
            expected_complexity="high",
            category="team",
            subcategory="explicit_team",
            paraphrases=["I need analysis and a written report", "分析数据并写报告"],
        )

        # Original wording first; this is the only run that must succeed.
        outcome = _run_team_benchmark(
            case,
            api_client,
            metrics_collector,
            test_name="team_explicit",
        )
        assert outcome["status_code"] == 200

        # Paraphrased wordings are recorded for metrics only.
        for index, reworded in enumerate(case.paraphrases):
            _run_team_benchmark(
                case,
                api_client,
                metrics_collector,
                test_name=f"team_explicit_para_{index}",
                is_paraphrase=True,
                input_override=reworded,
            )
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
# Fallback Intelligence
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.mark.e2e_capability
class TestTeamFallback:
    """Fallback checks for team collaboration, driven through the task API."""

    def test_fallback_to_single_agent_on_team_failure(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Submit a task pinned to ``fallback_skill`` and require HTTP 200.

        The original input must succeed; each paraphrase is then recorded
        without asserting on it.

        NOTE(review): nothing in this test forces a team failure, so it only
        shows that the request completes — confirm how the fallback path is
        meant to be triggered.
        """
        register_skill_via_api(api_client, "fallback_skill", keywords=["fallback_test"])

        case = BenchmarkCase(
            id="team-fallback-001",
            input="Complex task that might need fallback",
            expected_skill="fallback_skill",
            expected_complexity="high",
            category="team",
            subcategory="fallback",
            paraphrases=["Difficult task requiring fallback mechanism", "需要回退机制的复杂任务"],
        )

        # Original wording first; this is the only run that must succeed.
        outcome = _run_team_benchmark(
            case,
            api_client,
            metrics_collector,
            test_name="team_fallback",
        )
        assert outcome["status_code"] == 200

        # Paraphrased wordings are recorded for metrics only.
        for index, reworded in enumerate(case.paraphrases):
            _run_team_benchmark(
                case,
                api_client,
                metrics_collector,
                test_name=f"team_fallback_para_{index}",
                is_paraphrase=True,
                input_override=reworded,
            )
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
# Expert Name Validation
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.mark.e2e_capability
class TestExpertNameValidation:
    """Exercise skill registration with well-formed and malformed names."""

    def test_valid_expert_names(self, api_client: httpx.Client):
        """Names made of letters, digits, dashes and underscores register.

        A 409 is tolerated alongside 200/201.
        """
        accepted_statuses = (200, 201, 409)
        well_formed = ["analyst", "data-scientist", "code_reviewer", "expert-123"]
        for name in well_formed:
            response = register_skill_via_api(api_client, name, keywords=[name])
            assert response.status_code in accepted_statuses, f"Failed for name: {name}"

    def test_invalid_expert_name_rejected(self, api_client: httpx.Client):
        """Malformed names must yield one of the anticipated status codes.

        NOTE(review): the allowed set includes 200 and 201, so this test
        does not actually enforce rejection; it only rules out statuses
        outside the list. Confirm whether it should be limited to the 4xx
        codes.
        """
        anticipated_statuses = (200, 201, 400, 409, 422)
        # Covers a space, a special character, the empty string and a
        # 65-character name.
        malformed = ["expert with spaces", "expert@special", "", "a" * 65]
        for name in malformed:
            response = register_skill_via_api(api_client, name, keywords=[name])
            assert response.status_code in anticipated_statuses, (
                f"Unexpected status for name: '{name}'"
            )
|