325 lines
12 KiB
Python
325 lines
12 KiB
Python
"""E2E Agent Capability Tests — ReAct Reasoning & Execution with Metrics.

Tests the intelligence of agent execution AND collects data for:
- Execution mode selection accuracy
- Quality gate effectiveness
- Task success rate by mode
- Output standardization consistency
- Overfitting detection via paraphrased inputs
"""
|
|
|
|
import pytest
|
|
import httpx
|
|
|
|
from tests.e2e.benchmark_dataset import EXECUTION_BENCHMARKS, BenchmarkCase
|
|
from tests.e2e.capability_metrics import MetricsCollector
|
|
from tests.e2e.conftest import register_skill_via_api
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# Helper: run execution benchmark and record metrics
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
def _run_exec_benchmark(
    benchmark: BenchmarkCase,
    api_client: httpx.Client,
    collector: MetricsCollector,
    test_name: str,
    is_paraphrase: bool = False,
    input_override: str | None = None,
) -> dict:
    """Submit one execution benchmark to the tasks API and record its metrics.

    Args:
        benchmark: Case to run; ``id``, ``input`` and ``expected_skill`` are
            read here, and the case itself is handed to the collector.
        api_client: HTTP client used to POST to ``/api/v1/tasks``.
        collector: Receives the ``start_timer`` and
            ``record_benchmark_result`` calls.
        test_name: Label stored with the recorded result.
        is_paraphrase: Forwarded to the collector to flag paraphrase runs.
        input_override: Query to send instead of ``benchmark.input``. Only
            ``None`` means "no override"; an empty string is sent as-is.

    Returns:
        A dict with ``status_code``, ``actual_skill``, ``actual_exec_mode``,
        ``actual_keys`` and ``task_succeeded`` (true only for HTTP 200).
    """
    # Compare against None instead of relying on truthiness, so that an
    # explicit empty-string override is not replaced by the original input.
    query = benchmark.input if input_override is None else input_override
    collector.start_timer(benchmark.id)

    payload: dict = {"input_data": {"query": query}}
    if benchmark.expected_skill is not None:
        payload["skill_name"] = benchmark.expected_skill

    resp = api_client.post("/api/v1/tasks", json=payload)

    actual_skill = None
    actual_exec_mode = None
    actual_keys = []
    task_succeeded = resp.status_code == 200
    error_msg = None

    if task_succeeded:
        # Deliberately strict: a 200 whose body is not a JSON object should
        # fail loudly here rather than be recorded as a clean success.
        data = resp.json()
        actual_skill = data.get("skill_name")
        actual_exec_mode = data.get("execution_mode")
        actual_keys = list(data.keys())
    elif resp.status_code >= 400:
        # Prefer the JSON "detail" field; fall back to a truncated raw body
        # when the response is not JSON or is not a JSON object.
        fallback = resp.text[:200]
        try:
            body = resp.json()
        except ValueError:  # includes json.JSONDecodeError / UnicodeDecodeError
            body = None
        error_msg = body.get("detail", fallback) if isinstance(body, dict) else fallback

    collector.record_benchmark_result(
        benchmark,
        test_name=test_name,
        actual_skill=actual_skill,
        actual_execution_mode=actual_exec_mode,
        actual_status_code=resp.status_code,
        actual_response_keys=actual_keys,
        task_succeeded=task_succeeded,
        is_paraphrase=is_paraphrase,
        error_message=error_msg,
    )

    return {
        "status_code": resp.status_code,
        "actual_skill": actual_skill,
        "actual_exec_mode": actual_exec_mode,
        "actual_keys": actual_keys,
        "task_succeeded": task_succeeded,
    }
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# Parameterized Execution Benchmark Tests
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.mark.e2e_capability
class TestExecutionBenchmarks:
    """Run every case in ``EXECUTION_BENCHMARKS`` and record metrics for each."""

    @pytest.mark.parametrize(
        "benchmark",
        EXECUTION_BENCHMARKS,
        ids=[b.id for b in EXECUTION_BENCHMARKS],
    )
    def test_execution_benchmark(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run the benchmark's original input and require an HTTP 200.

        When the case names an expected skill, that skill is registered first,
        using its own name as the only keyword and the case's expected
        execution mode.
        """
        if benchmark.expected_skill:
            # The expected mode is forwarded unchanged; no remapping is
            # needed because "direct" already maps to "direct".
            register_skill_via_api(
                api_client,
                benchmark.expected_skill,
                keywords=[benchmark.expected_skill],
                execution_mode=benchmark.expected_execution_mode,
            )

        result = _run_exec_benchmark(
            benchmark,
            api_client,
            metrics_collector,
            test_name=f"exec_benchmark_{benchmark.id}",
        )
        assert result["status_code"] == 200, f"Benchmark {benchmark.id} failed"

    @pytest.mark.parametrize(
        "benchmark",
        [b for b in EXECUTION_BENCHMARKS if b.paraphrases],
        ids=[b.id for b in EXECUTION_BENCHMARKS if b.paraphrases],
    )
    def test_execution_paraphrase(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Send each paraphrase of the benchmark and record the outcome.

        Nothing is asserted here: results are only handed to the collector,
        flagged as paraphrase runs.
        """
        # NOTE(review): no skill is registered in this test; it presumably
        # relies on registration done elsewhere — confirm.
        for i, paraphrase in enumerate(benchmark.paraphrases):
            _run_exec_benchmark(
                benchmark,
                api_client,
                metrics_collector,
                test_name=f"exec_paraphrase_{benchmark.id}_{i}",
                is_paraphrase=True,
                input_override=paraphrase,
            )
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# ReAct Loop Intelligence
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.mark.e2e_capability
class TestReActIntelligence:
    """Exercise a skill registered with the ``react`` execution mode."""

    def test_react_skill_executes_steps(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run a research-style query through a react-mode skill.

        The original input must return HTTP 200. The two paraphrases are then
        sent and recorded as paraphrase runs without any assertion.
        """
        paraphrased_inputs = [
            "Investigate AI's effect on medical industry",
            "调研AI对医疗行业的影响",
        ]
        case = BenchmarkCase(
            id="react-steps-001",
            input="Research and analyze the impact of AI on healthcare",
            expected_skill="react_reasoner",
            expected_execution_mode="react",
            expected_complexity="high",
            category="execution",
            subcategory="react_mode",
            paraphrases=paraphrased_inputs,
        )
        register_skill_via_api(
            api_client,
            "react_reasoner",
            keywords=["react_reason", "research", "analyze"],
            execution_mode="react",
        )

        # Baseline run with the case's own input.
        outcome = _run_exec_benchmark(
            benchmark=case,
            api_client=api_client,
            collector=metrics_collector,
            test_name="react_steps",
        )
        assert outcome["status_code"] == 200

        # Paraphrase runs: metrics only, no assertion.
        for idx, variant in enumerate(case.paraphrases):
            _run_exec_benchmark(
                benchmark=case,
                api_client=api_client,
                collector=metrics_collector,
                test_name=f"react_steps_para_{idx}",
                is_paraphrase=True,
                input_override=variant,
            )
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# Quality Gate Intelligence
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.mark.e2e_capability
class TestQualityGateIntelligence:
    """Submit quality-gate oriented tasks and check the API answers sanely."""

    def test_quality_gate_with_required_fields(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run a content-generation task against ``quality_skill``.

        Any of HTTP 200, 400 or 422 is accepted as the response status.
        """
        skill = "quality_skill"
        case = BenchmarkCase(
            id="quality-fields-001",
            input="Generate content with quality check",
            expected_skill=skill,
            expected_complexity="medium",
            category="execution",
            subcategory="quality_gate",
        )
        register_skill_via_api(api_client, skill, keywords=["quality_test"])

        outcome = _run_exec_benchmark(
            benchmark=case,
            api_client=api_client,
            collector=metrics_collector,
            test_name="quality_fields",
        )
        status = outcome["status_code"]
        assert status in (200, 400, 422)

    def test_quality_gate_rejects_empty_output(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run a task with an empty input string against ``quality_empty``.

        The request only needs to be handled gracefully: HTTP 200, 400 or 422
        are all accepted.
        """
        skill = "quality_empty"
        case = BenchmarkCase(
            id="quality-empty-001",
            input="",
            expected_skill=skill,
            expected_complexity="low",
            category="execution",
            subcategory="quality_gate",
        )
        register_skill_via_api(api_client, skill, keywords=["quality_empty"])

        outcome = _run_exec_benchmark(
            benchmark=case,
            api_client=api_client,
            collector=metrics_collector,
            test_name="quality_empty",
        )
        status = outcome["status_code"]
        assert status in (200, 400, 422)
|
|
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
# Output Standardization Intelligence
|
|
# ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
|
|
@pytest.mark.e2e_capability
class TestOutputStandardization:
    """Check that task responses share a common top-level shape."""

    def test_output_has_required_structure(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """A simple task against a registered skill must return HTTP 200."""
        register_skill_via_api(api_client, "output_std_skill", keywords=["output_std"])

        benchmark = BenchmarkCase(
            id="output-std-001",
            input="Test output standardization",
            expected_skill="output_std_skill",
            expected_complexity="low",
            category="execution",
            subcategory="output_std",
        )

        result = _run_exec_benchmark(
            benchmark,
            api_client,
            metrics_collector,
            test_name="output_std",
        )
        assert result["status_code"] == 200
        assert result["task_succeeded"]

    def test_different_skills_produce_consistent_format(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Two different skills must share at least one top-level response key.

        The comparison only runs when both tasks return HTTP 200.
        """
        register_skill_via_api(api_client, "format_skill_a", keywords=["format_a"])
        register_skill_via_api(api_client, "format_skill_b", keywords=["format_b"])

        bench_a = BenchmarkCase(
            id="format-a-001",
            input="Test format A",
            expected_skill="format_skill_a",
            expected_complexity="low",
            category="execution",
            subcategory="output_std",
        )
        bench_b = BenchmarkCase(
            id="format-b-001",
            input="Test format B",
            expected_skill="format_skill_b",
            expected_complexity="low",
            category="execution",
            subcategory="output_std",
        )

        result_a = _run_exec_benchmark(bench_a, api_client, metrics_collector, test_name="format_a")
        result_b = _run_exec_benchmark(bench_b, api_client, metrics_collector, test_name="format_b")

        # NOTE(review): if either task fails, this test passes without
        # comparing anything — confirm that leniency is intended.
        if result_a["task_succeeded"] and result_b["task_succeeded"]:
            keys_a = set(result_a["actual_keys"])
            keys_b = set(result_b["actual_keys"])
            # Require a real overlap; checking that one response merely has
            # keys would say nothing about consistency between the two.
            assert keys_a & keys_b, (
                f"No shared response keys: {sorted(keys_a)} vs {sorted(keys_b)}"
            )
|