# Source: fischer-agentkit/tests/e2e/test_capability_react.py (Python, 325 lines, 12 KiB)

"""E2E Agent Capability Tests — ReAct Reasoning & Execution with Metrics.
Tests the intelligence of agent execution AND collects data for:
- Execution mode selection accuracy
- Quality gate effectiveness
- Task success rate by mode
- Output standardization consistency
- Overfitting detection via paraphrased inputs
"""
import pytest
import httpx
from tests.e2e.benchmark_dataset import EXECUTION_BENCHMARKS, BenchmarkCase
from tests.e2e.capability_metrics import MetricsCollector
from tests.e2e.conftest import register_skill_via_api
# ═══════════════════════════════════════════════════════════════════════════
# Helper: run execution benchmark and record metrics
# ═══════════════════════════════════════════════════════════════════════════
def _run_exec_benchmark(
    benchmark: BenchmarkCase,
    api_client: httpx.Client,
    collector: MetricsCollector,
    test_name: str,
    is_paraphrase: bool = False,
    input_override: str | None = None,
) -> dict:
    """Submit one benchmark query to the tasks API and record the outcome.

    Args:
        benchmark: Case supplying the default query, the id used for the
            collector timer, and the optional ``expected_skill`` to request.
        api_client: Client used to POST to ``/api/v1/tasks``.
        collector: Receives ``start_timer`` before the request and
            ``record_benchmark_result`` after it.
        test_name: Label stored with the recorded result.
        is_paraphrase: Forwarded to the collector so paraphrase runs can be
            told apart from runs on the original input.
        input_override: Query to send instead of ``benchmark.input``.
            ``None`` means "use the benchmark's own input"; any string,
            including the empty string, is sent as given.

    Returns:
        A dict with the keys ``status_code``, ``actual_skill``,
        ``actual_exec_mode``, ``actual_keys`` and ``task_succeeded``.
    """
    # Only ``None`` means "no override"; a plain truthiness test would
    # silently replace an intentionally empty query with the default input.
    query = benchmark.input if input_override is None else input_override

    collector.start_timer(benchmark.id)

    payload: dict = {"input_data": {"query": query}}
    if benchmark.expected_skill is not None:
        payload["skill_name"] = benchmark.expected_skill
    resp = api_client.post("/api/v1/tasks", json=payload)

    actual_skill = None
    actual_exec_mode = None
    actual_keys: list[str] = []
    error_msg = None
    # Success is exactly HTTP 200; any other status is recorded as a failure.
    task_succeeded = resp.status_code == 200

    if task_succeeded:
        # A malformed success body is left to raise so the calling test fails
        # loudly instead of recording a bogus success.
        data = resp.json()
        actual_skill = data.get("skill_name")
        actual_exec_mode = data.get("execution_mode")
        actual_keys = list(data.keys())
    elif resp.status_code >= 400:
        # Prefer the structured "detail" field; fall back to a truncated raw
        # body when the response is not JSON or is not a JSON object.
        fallback = resp.text[:200]
        try:
            body = resp.json()
        except ValueError:
            body = None
        if isinstance(body, dict):
            error_msg = body.get("detail", fallback)
        else:
            error_msg = fallback

    collector.record_benchmark_result(
        benchmark,
        test_name=test_name,
        actual_skill=actual_skill,
        actual_execution_mode=actual_exec_mode,
        actual_status_code=resp.status_code,
        actual_response_keys=actual_keys,
        task_succeeded=task_succeeded,
        is_paraphrase=is_paraphrase,
        error_message=error_msg,
    )
    return {
        "status_code": resp.status_code,
        "actual_skill": actual_skill,
        "actual_exec_mode": actual_exec_mode,
        "actual_keys": actual_keys,
        "task_succeeded": task_succeeded,
    }
# ═══════════════════════════════════════════════════════════════════════════
# Parameterized Execution Benchmark Tests
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestExecutionBenchmarks:
    """Run the execution benchmarks, original and paraphrased, with metrics."""

    # Benchmarks that carry at least one paraphrase; computed once and shared
    # by both ``parametrize`` arguments of the paraphrase test.
    _PARAPHRASED = [b for b in EXECUTION_BENCHMARKS if b.paraphrases]

    @staticmethod
    def _register_expected_skill(
        api_client: httpx.Client,
        benchmark: BenchmarkCase,
    ) -> None:
        """Register ``benchmark.expected_skill`` via the API, if one is set.

        The skill's own name is used as its single keyword and the
        benchmark's expected execution mode is forwarded unchanged.
        """
        if not benchmark.expected_skill:
            return
        # NOTE(review): expected_execution_mode is passed through as-is, even
        # when it is None — confirm register_skill_via_api accepts that.
        register_skill_via_api(
            api_client,
            benchmark.expected_skill,
            keywords=[benchmark.expected_skill],
            execution_mode=benchmark.expected_execution_mode,
        )

    @pytest.mark.parametrize(
        "benchmark",
        EXECUTION_BENCHMARKS,
        ids=[b.id for b in EXECUTION_BENCHMARKS],
    )
    def test_execution_benchmark(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run the benchmark's original input and require an HTTP 200."""
        self._register_expected_skill(api_client, benchmark)
        result = _run_exec_benchmark(
            benchmark,
            api_client,
            metrics_collector,
            test_name=f"exec_benchmark_{benchmark.id}",
        )
        assert result["status_code"] == 200, f"Benchmark {benchmark.id} failed"

    @pytest.mark.parametrize(
        "benchmark",
        _PARAPHRASED,
        ids=[b.id for b in _PARAPHRASED],
    )
    def test_execution_paraphrase(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run every paraphrase of the benchmark for overfitting detection.

        Outcomes are only recorded in the metrics collector; no assertion is
        made on the individual responses.
        """
        # Register here as well so this test does not depend on
        # test_execution_benchmark having run first against the same server.
        self._register_expected_skill(api_client, benchmark)
        for i, paraphrase in enumerate(benchmark.paraphrases):
            _run_exec_benchmark(
                benchmark,
                api_client,
                metrics_collector,
                test_name=f"exec_paraphrase_{benchmark.id}_{i}",
                is_paraphrase=True,
                input_override=paraphrase,
            )
# ═══════════════════════════════════════════════════════════════════════════
# ReAct Loop Intelligence
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestReActIntelligence:
    """Checks for a skill registered with the ``react`` execution mode."""

    def test_react_skill_executes_steps(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Submit a research-style query to a ReAct skill and expect HTTP 200.

        The original input must return 200. The paraphrases are then
        submitted as well, but their outcomes are only recorded in the
        metrics collector and are not asserted on.
        """
        case = BenchmarkCase(
            id="react-steps-001",
            input="Research and analyze the impact of AI on healthcare",
            expected_skill="react_reasoner",
            expected_execution_mode="react",
            expected_complexity="high",
            category="execution",
            subcategory="react_mode",
            paraphrases=[
                "Investigate AI's effect on medical industry",
                "调研AI对医疗行业的影响",
            ],
        )

        register_skill_via_api(
            api_client,
            "react_reasoner",
            keywords=["react_reason", "research", "analyze"],
            execution_mode="react",
        )

        primary = _run_exec_benchmark(
            benchmark=case,
            api_client=api_client,
            collector=metrics_collector,
            test_name="react_steps",
        )
        assert primary["status_code"] == 200

        # Paraphrase runs feed the overfitting metrics only.
        for idx, variant in enumerate(case.paraphrases):
            _run_exec_benchmark(
                benchmark=case,
                api_client=api_client,
                collector=metrics_collector,
                test_name=f"react_steps_para_{idx}",
                is_paraphrase=True,
                input_override=variant,
            )
# ═══════════════════════════════════════════════════════════════════════════
# Quality Gate Intelligence
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestQualityGateIntelligence:
    """Quality-gate scenarios: the API must answer with 200, 400 or 422."""

    def test_quality_gate_with_required_fields(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Submit a content-generation query to ``quality_skill``.

        NOTE(review): only the status code is checked here; nothing in this
        test inspects required fields in the output.
        """
        case = BenchmarkCase(
            id="quality-fields-001",
            input="Generate content with quality check",
            expected_skill="quality_skill",
            expected_complexity="medium",
            category="execution",
            subcategory="quality_gate",
        )
        register_skill_via_api(
            api_client,
            "quality_skill",
            keywords=["quality_test"],
        )

        outcome = _run_exec_benchmark(
            benchmark=case,
            api_client=api_client,
            collector=metrics_collector,
            test_name="quality_fields",
        )
        status = outcome["status_code"]
        assert status in (200, 400, 422)

    def test_quality_gate_rejects_empty_output(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Submit an empty query to ``quality_empty`` and expect no crash.

        Success, a client error, or a validation error are all accepted.
        """
        case = BenchmarkCase(
            id="quality-empty-001",
            input="",
            expected_skill="quality_empty",
            expected_complexity="low",
            category="execution",
            subcategory="quality_gate",
        )
        register_skill_via_api(
            api_client,
            "quality_empty",
            keywords=["quality_empty"],
        )

        outcome = _run_exec_benchmark(
            benchmark=case,
            api_client=api_client,
            collector=metrics_collector,
            test_name="quality_empty",
        )
        # The empty input only has to be handled gracefully.
        status = outcome["status_code"]
        assert status in (200, 400, 422)
# ═══════════════════════════════════════════════════════════════════════════
# Output Standardization Intelligence
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestOutputStandardization:
    """Test that task responses from different skills share a structure."""

    def test_output_has_required_structure(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """A task for a freshly registered skill should succeed with HTTP 200."""
        register_skill_via_api(api_client, "output_std_skill", keywords=["output_std"])
        benchmark = BenchmarkCase(
            id="output-std-001",
            input="Test output standardization",
            expected_skill="output_std_skill",
            expected_complexity="low",
            category="execution",
            subcategory="output_std",
        )
        result = _run_exec_benchmark(
            benchmark,
            api_client,
            metrics_collector,
            test_name="output_std",
        )
        assert result["status_code"] == 200
        assert result["task_succeeded"]

    def test_different_skills_produce_consistent_format(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Two different skills should return responses with shared keys.

        The comparison is only meaningful when both tasks succeed; otherwise
        the test is reported as skipped rather than passing silently.
        """
        register_skill_via_api(api_client, "format_skill_a", keywords=["format_a"])
        register_skill_via_api(api_client, "format_skill_b", keywords=["format_b"])
        bench_a = BenchmarkCase(
            id="format-a-001",
            input="Test format A",
            expected_skill="format_skill_a",
            expected_complexity="low",
            category="execution",
            subcategory="output_std",
        )
        bench_b = BenchmarkCase(
            id="format-b-001",
            input="Test format B",
            expected_skill="format_skill_b",
            expected_complexity="low",
            category="execution",
            subcategory="output_std",
        )
        result_a = _run_exec_benchmark(
            bench_a, api_client, metrics_collector, test_name="format_a"
        )
        result_b = _run_exec_benchmark(
            bench_b, api_client, metrics_collector, test_name="format_b"
        )

        if not (result_a["task_succeeded"] and result_b["task_succeeded"]):
            # Both runs are already recorded in the metrics; surface the unmet
            # precondition instead of reporting a pass that checked nothing.
            pytest.skip("format comparison requires both tasks to succeed")

        keys_a = set(result_a["actual_keys"])
        keys_b = set(result_b["actual_keys"])
        shared_keys = keys_a & keys_b
        # Require a real overlap: a non-empty key set on one side alone says
        # nothing about consistency between the two responses.
        assert shared_keys, (
            "responses share no keys: "
            f"{sorted(keys_a)} vs {sorted(keys_b)}"
        )