"""E2E Agent Capability Tests — ReAct Reasoning & Execution with Metrics.

Tests the intelligence of agent execution AND collects data for:
- Execution mode selection accuracy
- Quality gate effectiveness
- Task success rate by mode
- Output standardization consistency
- Overfitting detection via paraphrased inputs
"""

import pytest
import httpx

from tests.e2e.benchmark_dataset import EXECUTION_BENCHMARKS, BenchmarkCase
from tests.e2e.capability_metrics import MetricsCollector
from tests.e2e.conftest import register_skill_via_api


# ═══════════════════════════════════════════════════════════════════════════
# Helper: run execution benchmark and record metrics
# ═══════════════════════════════════════════════════════════════════════════


def _run_exec_benchmark(
    benchmark: BenchmarkCase,
    api_client: httpx.Client,
    collector: MetricsCollector,
    test_name: str,
    is_paraphrase: bool = False,
    input_override: str | None = None,
) -> dict:
    """Execute an execution benchmark and record metrics.

    Posts the benchmark query to ``/api/v1/tasks``, extracts the fields of
    interest from the response, and forwards them to
    ``collector.record_benchmark_result``.

    Args:
        benchmark: Benchmark case supplying ``id``, ``input`` and
            ``expected_skill``.
        api_client: HTTP client used to issue the POST request.
        collector: Metrics collector; its timer is started for
            ``benchmark.id`` before the request is sent.
        test_name: Label recorded alongside the result.
        is_paraphrase: Whether this run uses a paraphrased input.
        input_override: Query text to send instead of ``benchmark.input``.
            Only ``None`` means "no override"; an empty string is sent as-is.

    Returns:
        A dict with the keys ``status_code``, ``actual_skill``,
        ``actual_exec_mode``, ``actual_keys`` and ``task_succeeded``.
    """
    # Compare against None explicitly: `input_override or benchmark.input`
    # would silently discard a deliberate empty-string override.
    query = benchmark.input if input_override is None else input_override

    collector.start_timer(benchmark.id)

    payload: dict = {"input_data": {"query": query}}
    if benchmark.expected_skill is not None:
        payload["skill_name"] = benchmark.expected_skill

    resp = api_client.post("/api/v1/tasks", json=payload)

    actual_skill = None
    actual_exec_mode = None
    actual_keys: list[str] = []
    # Only an exact 200 counts as success for the metrics.
    task_succeeded = resp.status_code == 200
    error_msg = None

    if task_succeeded:
        data = resp.json()
        actual_skill = data.get("skill_name")
        actual_exec_mode = data.get("execution_mode")
        actual_keys = list(data.keys())
    elif resp.status_code >= 400:
        try:
            error_msg = resp.json().get("detail", resp.text[:200])
        except (ValueError, AttributeError):
            # Body is not JSON (ValueError) or is JSON but not an object
            # (AttributeError on .get): fall back to a raw-text excerpt.
            error_msg = resp.text[:200]

    collector.record_benchmark_result(
        benchmark,
        test_name=test_name,
        actual_skill=actual_skill,
        actual_execution_mode=actual_exec_mode,
        actual_status_code=resp.status_code,
        actual_response_keys=actual_keys,
        task_succeeded=task_succeeded,
        is_paraphrase=is_paraphrase,
        error_message=error_msg,
    )

    return {
        "status_code": resp.status_code,
        "actual_skill": actual_skill,
        "actual_exec_mode": actual_exec_mode,
        "actual_keys": actual_keys,
        "task_succeeded": task_succeeded,
    }


# ═══════════════════════════════════════════════════════════════════════════
# Parameterized Execution Benchmark Tests
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestExecutionBenchmarks:
    """Run all execution benchmarks with metrics collection."""

    @pytest.mark.parametrize(
        "benchmark",
        EXECUTION_BENCHMARKS,
        ids=[b.id for b in EXECUTION_BENCHMARKS],
    )
    def test_execution_benchmark(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run original execution benchmark and record metrics."""
        # Register the skill if expected
        if benchmark.expected_skill:
            # The previous `x if x != "direct" else "direct"` expression always
            # evaluated to x, so the expected mode is passed through directly.
            # NOTE(review): if expected_execution_mode can be None, a "direct"
            # default may have been intended here — confirm.
            register_skill_via_api(
                api_client,
                benchmark.expected_skill,
                keywords=[benchmark.expected_skill],
                execution_mode=benchmark.expected_execution_mode,
            )

        result = _run_exec_benchmark(
            benchmark,
            api_client,
            metrics_collector,
            test_name=f"exec_benchmark_{benchmark.id}",
        )
        assert result["status_code"] == 200, f"Benchmark {benchmark.id} failed"

    @pytest.mark.parametrize(
        "benchmark",
        [b for b in EXECUTION_BENCHMARKS if b.paraphrases],
        ids=[b.id for b in EXECUTION_BENCHMARKS if b.paraphrases],
    )
    def test_execution_paraphrase(
        self,
        benchmark: BenchmarkCase,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Run paraphrases for overfitting detection.

        This test only records metrics for each paraphrase; it makes no
        assertions about the responses.
        """
        for i, paraphrase in enumerate(benchmark.paraphrases):
            _run_exec_benchmark(
                benchmark,
                api_client,
                metrics_collector,
                test_name=f"exec_paraphrase_{benchmark.id}_{i}",
                is_paraphrase=True,
                input_override=paraphrase,
            )


# ═══════════════════════════════════════════════════════════════════════════
# ReAct Loop Intelligence
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestReActIntelligence:
    """Test that ReAct agents reason correctly through Think→Act→Observe."""

    def test_react_skill_executes_steps(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """ReAct skill should execute multiple steps for complex tasks.

        Only the original input is asserted to return 200; the paraphrase
        runs are recorded for metrics without assertions.
        """
        benchmark = BenchmarkCase(
            id="react-steps-001",
            input="Research and analyze the impact of AI on healthcare",
            expected_skill="react_reasoner",
            expected_execution_mode="react",
            expected_complexity="high",
            category="execution",
            subcategory="react_mode",
            paraphrases=[
                "Investigate AI's effect on medical industry",
                "调研AI对医疗行业的影响",
            ],
        )
        register_skill_via_api(
            api_client,
            "react_reasoner",
            keywords=["react_reason", "research", "analyze"],
            execution_mode="react",
        )

        result = _run_exec_benchmark(
            benchmark,
            api_client,
            metrics_collector,
            test_name="react_steps",
        )
        assert result["status_code"] == 200

        for i, para in enumerate(benchmark.paraphrases):
            _run_exec_benchmark(
                benchmark,
                api_client,
                metrics_collector,
                test_name=f"react_steps_para_{i}",
                is_paraphrase=True,
                input_override=para,
            )


# ═══════════════════════════════════════════════════════════════════════════
# Quality Gate Intelligence
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestQualityGateIntelligence:
    """Test that quality gate correctly validates and retries outputs."""

    def test_quality_gate_with_required_fields(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Quality gate should enforce required_fields in output."""
        benchmark = BenchmarkCase(
            id="quality-fields-001",
            input="Generate content with quality check",
            expected_skill="quality_skill",
            expected_complexity="medium",
            category="execution",
            subcategory="quality_gate",
        )
        register_skill_via_api(api_client, "quality_skill", keywords=["quality_test"])

        result = _run_exec_benchmark(
            benchmark,
            api_client,
            metrics_collector,
            test_name="quality_fields",
        )
        # Either success or a client-side validation rejection is accepted.
        assert result["status_code"] in (200, 400, 422)

    def test_quality_gate_rejects_empty_output(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Quality gate should reject empty or minimal output."""
        benchmark = BenchmarkCase(
            id="quality-empty-001",
            input="",
            expected_skill="quality_empty",
            expected_complexity="low",
            category="execution",
            subcategory="quality_gate",
        )
        register_skill_via_api(api_client, "quality_empty", keywords=["quality_empty"])

        result = _run_exec_benchmark(
            benchmark,
            api_client,
            metrics_collector,
            test_name="quality_empty",
        )
        # Should handle gracefully
        assert result["status_code"] in (200, 400, 422)


# ═══════════════════════════════════════════════════════════════════════════
# Output Standardization Intelligence
# ═══════════════════════════════════════════════════════════════════════════


@pytest.mark.e2e_capability
class TestOutputStandardization:
    """Test that agent outputs are properly standardized."""

    def test_output_has_required_structure(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Task results should have a consistent structure."""
        register_skill_via_api(api_client, "output_std_skill", keywords=["output_std"])
        benchmark = BenchmarkCase(
            id="output-std-001",
            input="Test output standardization",
            expected_skill="output_std_skill",
            expected_complexity="low",
            category="execution",
            subcategory="output_std",
        )

        result = _run_exec_benchmark(
            benchmark,
            api_client,
            metrics_collector,
            test_name="output_std",
        )
        assert result["status_code"] == 200
        assert result["task_succeeded"]

    def test_different_skills_produce_consistent_format(
        self,
        api_client: httpx.Client,
        metrics_collector: MetricsCollector,
    ):
        """Different skills should produce results in consistent format.

        The key-overlap check runs only when both tasks succeeded; otherwise
        the runs are recorded for metrics and nothing is asserted.
        """
        register_skill_via_api(api_client, "format_skill_a", keywords=["format_a"])
        register_skill_via_api(api_client, "format_skill_b", keywords=["format_b"])

        bench_a = BenchmarkCase(
            id="format-a-001",
            input="Test format A",
            expected_skill="format_skill_a",
            expected_complexity="low",
            category="execution",
            subcategory="output_std",
        )
        bench_b = BenchmarkCase(
            id="format-b-001",
            input="Test format B",
            expected_skill="format_skill_b",
            expected_complexity="low",
            category="execution",
            subcategory="output_std",
        )

        result_a = _run_exec_benchmark(
            bench_a, api_client, metrics_collector, test_name="format_a"
        )
        result_b = _run_exec_benchmark(
            bench_b, api_client, metrics_collector, test_name="format_b"
        )

        if result_a["task_succeeded"] and result_b["task_succeeded"]:
            # Both should have some common response keys. The earlier
            # `... or len(keys_a) > 0` clause let the check pass with zero
            # overlap whenever skill A returned any key at all.
            keys_a = set(result_a["actual_keys"])
            keys_b = set(result_b["actual_keys"])
            shared_keys = keys_a & keys_b
            assert shared_keys, (
                "Skills returned no common response keys: "
                f"{sorted(keys_a)} vs {sorted(keys_b)}"
            )