#!/usr/bin/env bash
# =============================================================================
# Fischer AgentKit — E2E Backtest Runner
# =============================================================================
#
# Usage:
#   ./scripts/run_e2e.sh               # Run all E2E tests
#   ./scripts/run_e2e.sh --basic       # Run basic function tests only
#   ./scripts/run_e2e.sh --capability  # Run agent capability tests only
#   ./scripts/run_e2e.sh --cli         # Run CLI tests only
#   ./scripts/run_e2e.sh --api         # Run API tests only
#   ./scripts/run_e2e.sh --ws          # Run WebSocket tests only
#   ./scripts/run_e2e.sh --routing     # Run routing intelligence tests
#   ./scripts/run_e2e.sh --react       # Run ReAct intelligence tests
#   ./scripts/run_e2e.sh --team        # Run team collaboration tests
#   ./scripts/run_e2e.sh --report      # Generate HTML report
#   ./scripts/run_e2e.sh --analyze     # Run capability tests + generate analysis report
#   ./scripts/run_e2e.sh --direct      # Run router direct backtest only (no HTTP)
#   ./scripts/run_e2e.sh --alignment   # Run alignment guard tests only
#   ./scripts/run_e2e.sh --full        # Run all: API + direct + alignment
#   ./scripts/run_e2e.sh --baseline    # Compare with last baseline report
#   ./scripts/run_e2e.sh --fast        # Stop at first failure, 30s per-test timeout
#   ./scripts/run_e2e.sh --help        # Print the option summary and exit
#
# Any argument not listed above is passed through to pytest unchanged.
# When several target options (--cli, --api, --direct, ...) are given, the
# last one wins.
#
# Environment:
#   E2E_PORT    - Server port (default: 18765)
#   E2E_API_KEY - API key for auth (default: ak_live_e2e_test_key_...)
#   SKIP_SERVER - Set to "1" to skip server startup (use existing)
#
# Exit status: the pytest exit code, or 1 when a dependency is missing or the
# E2E server cannot be started / reached.
# =============================================================================

set -euo pipefail

# ── Configuration ────────────────────────────────────────────────────────────

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
E2E_PORT="${E2E_PORT:-18765}"
E2E_API_KEY="${E2E_API_KEY:-ak_live_e2e_test_key_000000000000000000000000000000000000000000000000}"
REPORT_DIR="${PROJECT_ROOT}/test-results/e2e"
SKIP_SERVER="${SKIP_SERVER:-0}"

cd "$PROJECT_ROOT"

# ── Colors ───────────────────────────────────────────────────────────────────

RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# ── Helper Functions ─────────────────────────────────────────────────────────

# Colored, level-prefixed log lines written to stdout.
info() { echo -e "${BLUE}[INFO]${NC} $*"; }
ok()   { echo -e "${GREEN}[OK]${NC} $*"; }
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
fail() { echo -e "${RED}[FAIL]${NC} $*"; }

#######################################
# Verify that the external commands this script invokes are installed.
# python3 is always required; curl is required only when an HTTP server will
# be probed (i.e. the selected target needs a server).
# Globals:   SKIP_SERVER_FLAG (read)
# Outputs:   one [FAIL] line per missing command
# Returns:   exits the script with status 1 if anything is missing
#######################################
check_deps() {
  local -a required=(python3)
  local cmd
  local missing=0

  if [ "${SKIP_SERVER_FLAG:-0}" -eq 0 ]; then
    # Both start_server and wait_for_server use curl for the health probe.
    required+=(curl)
  fi

  for cmd in "${required[@]}"; do
    if ! command -v "$cmd" &>/dev/null; then
      fail "Missing dependency: $cmd"
      missing=1
    fi
  done

  if [ "$missing" -eq 1 ]; then
    exit 1
  fi
}

#######################################
# Poll the health endpoint until the server answers.
# Makes up to 60 attempts, 0.5s apart. Any HTTP response counts as ready:
# curl runs without --fail, so the status code is not inspected.
# Globals:   E2E_PORT (read), SERVER_PID (read, optional)
# Returns:   0 once the endpoint answers; 1 on timeout or if the server
#            process launched by this script has already exited
#######################################
wait_for_server() {
  local max_attempts=60
  local attempt=0

  info "Waiting for server on port $E2E_PORT..."
  while [ "$attempt" -lt "$max_attempts" ]; do
    # --max-time keeps a hung connection from blocking past the overall budget.
    if curl -s --max-time 2 "http://127.0.0.1:$E2E_PORT/api/v1/health" &>/dev/null; then
      ok "Server is ready on port $E2E_PORT"
      return 0
    fi
    # Give up immediately if the background server died (bad config, port in
    # use, import error, ...) instead of polling for the full timeout.
    if [ -n "${SERVER_PID:-}" ] && ! kill -0 "$SERVER_PID" 2>/dev/null; then
      fail "Server process (PID: $SERVER_PID) exited before becoming ready"
      return 1
    fi
    attempt=$((attempt + 1))
    sleep 0.5
  done

  fail "Server failed to start within 30 seconds"
  return 1
}

#######################################
# Start the AgentKit server in the background, or validate an existing one.
# With SKIP_SERVER=1 nothing is launched; the health endpoint of the server
# already listening on E2E_PORT is probed once instead.
# Globals:   SKIP_SERVER, E2E_PORT, E2E_API_KEY (read); SERVER_PID (written);
#            exports AGENTKIT_E2E_MODE, AGENTKIT_WS_TIMEOUT, AGENTKIT_API_KEY
# Returns:   0 when a server is reachable, 1 otherwise
#######################################
start_server() {
  if [ "$SKIP_SERVER" = "1" ]; then
    info "SKIP_SERVER=1, using existing server on port $E2E_PORT"
    if curl -s --max-time 2 "http://127.0.0.1:$E2E_PORT/api/v1/health" &>/dev/null; then
      ok "Existing server is healthy"
      return 0
    else
      fail "Existing server is not responding"
      return 1
    fi
  fi

  info "Starting AgentKit E2E server on port $E2E_PORT..."

  export AGENTKIT_E2E_MODE=1
  export AGENTKIT_WS_TIMEOUT=0
  export AGENTKIT_API_KEY="$E2E_API_KEY"

  # Start server in background
  python3 -m agentkit.cli.main serve --host 127.0.0.1 --port "$E2E_PORT" &
  SERVER_PID=$!

  if wait_for_server; then
    return 0
  else
    kill "$SERVER_PID" 2>/dev/null || true
    return 1
  fi
}

#######################################
# Stop the server launched by start_server, if any.
# Safe to call more than once: the trap below can fire for INT/TERM and then
# again for EXIT, so SERVER_PID is cleared after the first stop.
# Globals:   SKIP_SERVER (read), SERVER_PID (read, cleared)
# Returns:   0
#######################################
stop_server() {
  if [ "$SKIP_SERVER" = "1" ]; then
    info "SKIP_SERVER=1, not stopping server"
    return 0
  fi

  if [ -n "${SERVER_PID:-}" ]; then
    info "Stopping E2E server (PID: $SERVER_PID)..."
    # Best effort: the process may already be gone.
    kill "$SERVER_PID" 2>/dev/null || true
    wait "$SERVER_PID" 2>/dev/null || true
    SERVER_PID=""
    ok "Server stopped"
  fi
}

# ── Test Selection ───────────────────────────────────────────────────────────

PYTEST_ARGS=("--timeout=120" "-v" "--tb=short" "-s")
TEST_TARGET="tests/e2e/"
GENERATE_REPORT=0
ANALYZE=0
# 1 when the selected target does not talk to an HTTP server at all.
SKIP_SERVER_FLAG=0
BASELINE_COMPARE=0

while [[ $# -gt 0 ]]; do
  case $1 in
    --basic)
      PYTEST_ARGS+=("-m" "e2e_basic")
      shift
      ;;
    --capability)
      PYTEST_ARGS+=("-m" "e2e_capability")
      shift
      ;;
    --cli)
      TEST_TARGET="tests/e2e/test_basic_cli.py"
      SKIP_SERVER_FLAG=0
      shift
      ;;
    --api)
      TEST_TARGET="tests/e2e/test_basic_api.py"
      SKIP_SERVER_FLAG=0
      shift
      ;;
    --ws)
      TEST_TARGET="tests/e2e/test_basic_websocket.py"
      SKIP_SERVER_FLAG=0
      shift
      ;;
    --routing)
      TEST_TARGET="tests/e2e/test_capability_routing.py"
      SKIP_SERVER_FLAG=0
      shift
      ;;
    --react)
      TEST_TARGET="tests/e2e/test_capability_react.py"
      SKIP_SERVER_FLAG=0
      shift
      ;;
    --team)
      TEST_TARGET="tests/e2e/test_capability_team.py"
      SKIP_SERVER_FLAG=0
      shift
      ;;
    --direct)
      # Router direct backtest — no HTTP server needed
      TEST_TARGET="tests/e2e/test_capability_router_direct.py"
      SKIP_SERVER_FLAG=1
      shift
      ;;
    --alignment)
      # Alignment guard tests — no HTTP server needed
      TEST_TARGET="tests/e2e/test_capability_alignment.py"
      SKIP_SERVER_FLAG=1
      shift
      ;;
    --full)
      # Run all capability tests: API + direct + alignment
      PYTEST_ARGS+=("-m" "e2e_capability")
      shift
      ;;
    --baseline)
      BASELINE_COMPARE=1
      shift
      ;;
    --report)
      GENERATE_REPORT=1
      shift
      ;;
    --analyze)
      ANALYZE=1
      PYTEST_ARGS+=("-m" "e2e_capability")
      shift
      ;;
    --fast)
      PYTEST_ARGS+=("-x" "--timeout=30")
      shift
      ;;
    --help|-h)
      echo "Usage: $0 [--basic|--capability|--cli|--api|--ws|--routing|--react|--team|--direct|--alignment|--full|--baseline|--report|--analyze|--fast]"
      exit 0
      ;;
    *)
      # Anything unrecognised is handed to pytest verbatim.
      PYTEST_ARGS+=("$1")
      shift
      ;;
  esac
done

if [ "$GENERATE_REPORT" -eq 1 ]; then
  mkdir -p "$REPORT_DIR"
  PYTEST_ARGS+=(
    "--html=$REPORT_DIR/e2e_report.html"
    "--self-contained-html"
    "--junitxml=$REPORT_DIR/e2e_junit.xml"
  )
fi

if [ "$ANALYZE" -eq 1 ]; then
  info "Analysis mode: will generate capability report with recall/F1/overfitting analysis"
fi

# ── Main ─────────────────────────────────────────────────────────────────────

info "Fischer AgentKit E2E Backtest Runner"
info "====================================="
info "Project: $PROJECT_ROOT"
info "Port: $E2E_PORT"
info "Target: $TEST_TARGET"
info ""

check_deps

# Trap to ensure server cleanup
trap stop_server EXIT INT TERM

if [ "$SKIP_SERVER_FLAG" -eq 1 ]; then
  # --direct / --alignment run without HTTP, so neither launch a server nor
  # demand that one is already listening.
  info "Selected tests do not need an HTTP server; skipping server startup"
elif ! start_server; then
  fail "Could not start E2E server"
  exit 1
fi

info ""
info "Running E2E tests..."
info "===================="
info ""

export AGENTKIT_SERVER_URL="http://127.0.0.1:$E2E_PORT"
export AGENTKIT_API_KEY="$E2E_API_KEY"

# Capture the pytest status without tripping `set -e`, so reporting still runs.
EXIT_CODE=0
python3 -m pytest "$TEST_TARGET" "${PYTEST_ARGS[@]}" || EXIT_CODE=$?

echo ""
if [ "$EXIT_CODE" -eq 0 ]; then
  ok "All E2E tests passed!"
else
  fail "Some E2E tests failed (exit code: $EXIT_CODE)"
fi

if [ "$GENERATE_REPORT" -eq 1 ]; then
  info "Report generated at: $REPORT_DIR/e2e_report.html"
fi

if [ "$ANALYZE" -eq 1 ]; then
  CAPABILITY_REPORT="$PROJECT_ROOT/test-results/e2e/capability_report.txt"
  if [ -f "$CAPABILITY_REPORT" ]; then
    info "Capability analysis report:"
    echo ""
    cat "$CAPABILITY_REPORT"
  else
    warn "Capability report not found (may need capability tests to run first)"
  fi
fi

if [ "$BASELINE_COMPARE" -eq 1 ]; then
  CURRENT_REPORT="$PROJECT_ROOT/test-results/e2e/capability_report.json"
  BASELINE_REPORT="$PROJECT_ROOT/test-results/e2e/baseline_capability_report.json"

  if [ -f "$CURRENT_REPORT" ] && [ -f "$BASELINE_REPORT" ]; then
    info "Baseline comparison:"
    # The report paths travel as argv rather than being pasted into the Python
    # source, so quotes or backslashes in PROJECT_ROOT cannot alter the program.
    # A comparison failure only warns: the pytest status must stay the script's
    # exit status.
    if ! python3 - "$CURRENT_REPORT" "$BASELINE_REPORT" <<'PY'
import json, sys

def load_metrics(path):
    with open(path) as f:
        return json.load(f)

cur = load_metrics(sys.argv[1])
base = load_metrics(sys.argv[2])

metrics = [
    ('overall_skill_recall', '技能路由召回率'),
    ('overall_skill_precision', '技能路由精确率'),
    ('overall_skill_f1', '技能路由F1'),
    ('overall_execution_mode_accuracy', '执行模式准确率'),
    ('overall_task_success_rate', '任务成功率'),
    ('overfitting_score', '过拟合分数'),
]

print()
for key, label in metrics:
    c = cur.get(key, 0)
    b = base.get(key, 0)
    delta = c - b
    arrow = '↑' if delta > 0 else ('↓' if delta < 0 else '→')
    print(f' {label}: {b:.2%} → {c:.2%} {arrow} {delta:+.2%}')
print()
PY
    then
      warn "Baseline comparison failed (could not read or parse the report files)"
    fi
  elif [ -f "$CURRENT_REPORT" ]; then
    info "No baseline report found. Saving current report as baseline."
    cp "$CURRENT_REPORT" "$BASELINE_REPORT"
    info "Baseline saved to: $BASELINE_REPORT"
  else
    warn "No current report found. Run with --analyze first."
  fi
fi

exit "$EXIT_CODE"