# fischer-agentkit/tests/e2e/test_capability_team.py
#
# 253 lines
# 9.9 KiB
# Python

"""E2E Agent Capability Tests — Expert Team Collaboration with Metrics.
Tests the intelligence of expert team collaboration AND collects data for:
- Team formation accuracy
- Fallback effectiveness
- Expert coordination quality
- Overfitting detection via paraphrased inputs
"""
import pytest
import httpx
from tests.e2e.benchmark_dataset import TEAM_BENCHMARKS, BenchmarkCase
from tests.e2e.capability_metrics import MetricsCollector
from tests.e2e.conftest import register_skill_via_api
# ═══════════════════════════════════════════════════════════════════════════
# Helper: run team benchmark and record metrics
# ═══════════════════════════════════════════════════════════════════════════
def _run_team_benchmark(
    benchmark: BenchmarkCase,
    api_client: httpx.Client,
    collector: MetricsCollector,
    test_name: str,
    is_paraphrase: bool = False,
    input_override: str | None = None,
) -> dict:
    """Execute a team benchmark against the task API and record metrics.

    Posts the query to ``/api/v1/tasks`` (adding ``skill_name`` when the
    benchmark declares an expected skill) and passes the observed outcome to
    ``collector.record_benchmark_result``.

    Args:
        benchmark: Benchmark case supplying the id, the default input and the
            expected skill.
        api_client: HTTP client used to submit the task.
        collector: Metrics collector; its timer is started for
            ``benchmark.id`` before the request is sent.
        test_name: Label under which the result is recorded.
        is_paraphrase: Whether this run uses a paraphrased input.
        input_override: Query to send instead of ``benchmark.input``.
            ``None`` means "use the benchmark's own input".

    Returns:
        A dict with the keys ``status_code``, ``actual_skill``,
        ``actual_exec_mode`` and ``task_succeeded``.
    """
    # Only ``None`` selects the default input. An explicitly supplied override
    # (even an empty one) is sent as-is, so a paraphrase run is never silently
    # recorded against the benchmark's original wording.
    query = benchmark.input if input_override is None else input_override
    collector.start_timer(benchmark.id)

    payload: dict = {"input_data": {"query": query}}
    if benchmark.expected_skill:
        payload["skill_name"] = benchmark.expected_skill
    resp = api_client.post("/api/v1/tasks", json=payload)

    actual_skill = None
    actual_exec_mode = None
    actual_keys: list = []
    error_msg = None
    # Only an exact 200 is treated as success.
    task_succeeded = resp.status_code == 200

    if task_succeeded:
        data = resp.json()
        actual_skill = data.get("skill_name")
        actual_exec_mode = data.get("execution_mode")
        actual_keys = list(data.keys())
    elif resp.status_code >= 400:
        # Prefer the structured "detail" field; fall back to a truncated raw
        # body when the response is not JSON or is not a JSON object.
        fallback = resp.text[:200]
        try:
            body = resp.json()
        except ValueError:
            # JSON decoding errors are ValueError subclasses; anything else
            # is unexpected and should surface instead of being swallowed.
            body = None
        if isinstance(body, dict):
            error_msg = body.get("detail", fallback)
        else:
            error_msg = fallback

    collector.record_benchmark_result(
        benchmark,
        test_name=test_name,
        actual_skill=actual_skill,
        actual_execution_mode=actual_exec_mode,
        actual_status_code=resp.status_code,
        actual_response_keys=actual_keys,
        task_succeeded=task_succeeded,
        is_paraphrase=is_paraphrase,
        error_message=error_msg,
    )
    return {
        "status_code": resp.status_code,
        "actual_skill": actual_skill,
        "actual_exec_mode": actual_exec_mode,
        "task_succeeded": task_succeeded,
    }
# ═══════════════════════════════════════════════════════════════════════════
# Parameterized Team Benchmark Tests
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestTeamBenchmarks:
    """Run all team benchmarks with metrics collection."""

    @staticmethod
    def _register_expected_skill(
        api_client: httpx.Client,
        benchmark: BenchmarkCase,
    ) -> None:
        """Register the benchmark's expected skill, if it declares one.

        The skill name doubles as its only keyword.
        """
        if benchmark.expected_skill:
            register_skill_via_api(
                api_client,
                benchmark.expected_skill,
                keywords=[benchmark.expected_skill],
            )

    @pytest.mark.parametrize(
        "benchmark",
        TEAM_BENCHMARKS,
        ids=[b.id for b in TEAM_BENCHMARKS],
    )
    def test_team_benchmark(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run the original benchmark input, record metrics, expect HTTP 200."""
        self._register_expected_skill(api_client, benchmark)
        result = _run_team_benchmark(
            benchmark,
            api_client,
            metrics_collector,
            test_name=f"team_benchmark_{benchmark.id}",
        )
        assert result["status_code"] == 200, f"Team benchmark {benchmark.id} failed"

    @pytest.mark.parametrize(
        "benchmark",
        [b for b in TEAM_BENCHMARKS if b.paraphrases],
        ids=[b.id for b in TEAM_BENCHMARKS if b.paraphrases],
    )
    def test_team_paraphrase(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Replay every paraphrase of a benchmark for overfitting detection.

        Each paraphrase is submitted and recorded with ``is_paraphrase=True``.
        No assertion is made on the responses; the results are only recorded.
        """
        # The request names the expected skill, so register it here as well
        # instead of relying on test_team_benchmark having run first.
        self._register_expected_skill(api_client, benchmark)
        for i, paraphrase in enumerate(benchmark.paraphrases):
            _run_team_benchmark(
                benchmark,
                api_client,
                metrics_collector,
                test_name=f"team_paraphrase_{benchmark.id}_{i}",
                is_paraphrase=True,
                input_override=paraphrase,
            )
# ═══════════════════════════════════════════════════════════════════════════
# Team Formation Intelligence
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestTeamFormation:
    """Team-formation scenarios driven through the task API."""

    def test_explicit_team_prefix(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Submit an analyse-and-write query and record it with its paraphrases.

        Registers the ``team_analyst`` and ``team_writer`` skills, runs the
        original query (which must return HTTP 200), then replays each
        paraphrase for metrics only.

        NOTE(review): the test name refers to an explicit team prefix, but the
        query sent here carries no such prefix and only the status code is
        asserted — confirm whether the input should include it.
        """
        skills_to_register = (
            ("team_analyst", ["team_analyst", "analyze"]),
            ("team_writer", ["team_writer", "write"]),
        )
        for skill_name, skill_keywords in skills_to_register:
            register_skill_via_api(api_client, skill_name, keywords=skill_keywords)

        case = BenchmarkCase(
            id="team-explicit-001",
            input="Analyze the data and write a report",
            expected_skill="team_analyst",
            expected_execution_mode="react",
            expected_complexity="high",
            category="team",
            subcategory="explicit_team",
            paraphrases=["I need analysis and a written report", "分析数据并写报告"],
        )

        outcome = _run_team_benchmark(
            case,
            api_client,
            metrics_collector,
            test_name="team_explicit",
        )
        assert outcome["status_code"] == 200

        # Paraphrase runs are recorded but not asserted on.
        for idx, variant in enumerate(case.paraphrases):
            _run_team_benchmark(
                case,
                api_client,
                metrics_collector,
                test_name=f"team_explicit_para_{idx}",
                is_paraphrase=True,
                input_override=variant,
            )
# ═══════════════════════════════════════════════════════════════════════════
# Fallback Intelligence
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestTeamFallback:
    """Fallback scenarios for team collaboration, driven through the task API."""

    def test_fallback_to_single_agent_on_team_failure(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Submit a fallback-themed query and record it with its paraphrases.

        Registers ``fallback_skill``, runs the original query (which must
        return HTTP 200), then replays each paraphrase for metrics only.

        NOTE(review): nothing here forces the team path to fail, and only the
        status code is asserted — confirm that the fallback itself is
        exercised elsewhere.
        """
        register_skill_via_api(api_client, "fallback_skill", keywords=["fallback_test"])

        case = BenchmarkCase(
            id="team-fallback-001",
            input="Complex task that might need fallback",
            expected_skill="fallback_skill",
            expected_complexity="high",
            category="team",
            subcategory="fallback",
            paraphrases=["Difficult task requiring fallback mechanism", "需要回退机制的复杂任务"],
        )

        outcome = _run_team_benchmark(
            case,
            api_client,
            metrics_collector,
            test_name="team_fallback",
        )
        assert outcome["status_code"] == 200

        # Paraphrase runs are recorded but not asserted on.
        for idx, variant in enumerate(case.paraphrases):
            _run_team_benchmark(
                case,
                api_client,
                metrics_collector,
                test_name=f"team_fallback_para_{idx}",
                is_paraphrase=True,
                input_override=variant,
            )
# ═══════════════════════════════════════════════════════════════════════════
# Expert Name Validation
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestExpertNameValidation:
    """Checks on how the skill-registration API responds to expert names."""

    def test_valid_expert_names(self, api_client: httpx.Client):
        """Well-formed names must register with status 200, 201 or 409."""
        acceptable_statuses = (200, 201, 409)
        well_formed_names = ["analyst", "data-scientist", "code_reviewer", "expert-123"]
        for name in well_formed_names:
            response = register_skill_via_api(api_client, name, keywords=[name])
            assert response.status_code in acceptable_statuses, f"Failed for name: {name}"

    def test_invalid_expert_name_rejected(self, api_client: httpx.Client):
        """Malformed names must yield one of the tolerated status codes.

        NOTE(review): the tolerated set includes the success codes 200 and
        201, so this test does not currently require a rejection — confirm
        the intended contract before tightening it.
        """
        tolerated_statuses = (200, 201, 400, 409, 422)
        malformed_names = ["expert with spaces", "expert@special", "", "a" * 65]
        for name in malformed_names:
            response = register_skill_via_api(api_client, name, keywords=[name])
            assert response.status_code in tolerated_statuses, (
                f"Unexpected status for name: '{name}'"
            )