fischer-agentkit/scripts/run_e2e.sh

#!/usr/bin/env bash
# =============================================================================
# Fischer AgentKit — E2E Backtest Runner
# =============================================================================
#
# Usage:
#   ./scripts/run_e2e.sh                  # Run all E2E tests
#   ./scripts/run_e2e.sh --basic          # Run basic function tests only
#   ./scripts/run_e2e.sh --capability     # Run agent capability tests only
#   ./scripts/run_e2e.sh --cli            # Run CLI tests only
#   ./scripts/run_e2e.sh --api            # Run API tests only
#   ./scripts/run_e2e.sh --ws             # Run WebSocket tests only
#   ./scripts/run_e2e.sh --routing        # Run routing intelligence tests
#   ./scripts/run_e2e.sh --react          # Run ReAct intelligence tests
#   ./scripts/run_e2e.sh --team           # Run team collaboration tests
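#   ./scripts/run_e2e.sh --report         # Generate HTML and JUnit XML reports
#   ./scripts/run_e2e.sh --analyze        # Run capability tests + generate analysis report
#   ./scripts/run_e2e.sh --direct         # Run router direct backtest only (no HTTP)
#   ./scripts/run_e2e.sh --alignment      # Run alignment guard tests only
#   ./scripts/run_e2e.sh --full           # Run all capability-marked tests: API + direct + alignment
#   ./scripts/run_e2e.sh --baseline       # Compare with last baseline report (saves one if none exists)
#   ./scripts/run_e2e.sh --fast           # Pass -x and --timeout=30 to pytest (stop at first failure)
#   ./scripts/run_e2e.sh <other args>     # Unrecognized arguments are passed through to pytest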
#
# Environment:
#   E2E_PORT         - Server port (default: 18765)
#   E2E_API_KEY      - API key for auth (default: ak_live_e2e_test_key_...)
#   SKIP_SERVER      - Set to "1" to skip server startup (use existing)
# =============================================================================

set -euo pipefail

# ── Configuration ────────────────────────────────────────────────────────────

# Absolute path of the directory holding this script, and of its parent
# directory, which is treated as the project root.
SCRIPT_DIR="$(
    cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
PROJECT_ROOT="$(
    cd "$SCRIPT_DIR/.." && pwd
)"

# Environment overrides: an unset or empty value falls back to the default.
: "${E2E_PORT:=18765}"
: "${E2E_API_KEY:=ak_live_e2e_test_key_000000000000000000000000000000000000000000000000}"
: "${SKIP_SERVER:=0}"

# Directory that receives the generated reports.
REPORT_DIR="${PROJECT_ROOT}/test-results/e2e"

# Everything below uses paths relative to the project root.
cd "$PROJECT_ROOT"

# ── Colors ───────────────────────────────────────────────────────────────────

# ANSI escape sequences, stored unexpanded; `echo -e` interprets them on output.
RED="\033[0;31m"
GREEN="\033[0;32m"
YELLOW="\033[1;33m"
BLUE="\033[0;34m"
NC="\033[0m" # No Color

# ── Helper Functions ─────────────────────────────────────────────────────────

# Print one log line: $1 is the ready-made coloured prefix (including its
# trailing padding), the remaining arguments are joined into the message.
_log() {
    local prefix="$1"
    shift
    echo -e "${prefix}$*"
}

# Level-specific wrappers; the padding keeps the messages column-aligned.
info() {
    _log "${BLUE}[INFO]${NC}  " "$@"
}

ok() {
    _log "${GREEN}[OK]${NC}    " "$@"
}

warn() {
    _log "${YELLOW}[WARN]${NC}  " "$@"
}

fail() {
    _log "${RED}[FAIL]${NC}  " "$@"
}

#######################################
# Verify that the external commands this script invokes are on PATH.
# python3 is always required. curl is used for the server health checks, so
# it is required too unless the selected tests need no HTTP server
# (SKIP_SERVER_FLAG=1, set by --direct / --alignment).
# Globals:   SKIP_SERVER_FLAG (read; treated as 0 when unset)
# Outputs:   one "Missing dependency" line per missing command, via fail
# Returns:   0 when everything is present; otherwise exits the script with 1
#######################################
check_deps() {
    local cmd
    local missing=0
    local required=(python3)

    if [ "${SKIP_SERVER_FLAG:-0}" != "1" ]; then
        required+=(curl)
    fi

    # Report every missing command before giving up, not just the first one.
    for cmd in "${required[@]}"; do
        if ! command -v "$cmd" &>/dev/null; then
            fail "Missing dependency: $cmd"
            missing=1
        fi
    done

    if [ "$missing" -eq 1 ]; then
        exit 1
    fi
}

#######################################
# Poll the health endpoint until the server answers or the attempts run out.
# Any completed HTTP exchange counts as "ready" (the status code is not
# inspected).
# Globals:   E2E_PORT (read), SERVER_PID (read, optional)
# Outputs:   progress and result messages via info / ok / fail
# Returns:   0 once the endpoint answers; 1 if the server process has exited
#            or no answer arrived within max_attempts probes
#######################################
wait_for_server() {
    local max_attempts=60
    local attempt=0
    local health_url="http://127.0.0.1:${E2E_PORT}/api/v1/health"

    info "Waiting for server on port $E2E_PORT..."
    while [ "$attempt" -lt "$max_attempts" ]; do
        # --max-time bounds each probe so a server that accepts the connection
        # but never responds cannot hang the script.
        if curl -s --max-time 2 "$health_url" &>/dev/null; then
            ok "Server is ready on port $E2E_PORT"
            return 0
        fi
        # If the background server process is already gone there is nothing
        # left to wait for; fail now instead of polling until the limit.
        if [ -n "${SERVER_PID:-}" ] && ! kill -0 "$SERVER_PID" 2>/dev/null; then
            fail "Server process (PID: $SERVER_PID) exited before becoming ready"
            return 1
        fi
        attempt=$((attempt + 1))
        sleep 0.5
    done
    fail "Server did not become ready after $max_attempts health checks"
    return 1
}

#######################################
# Make a server available for the tests.
# With SKIP_SERVER=1 nothing is launched; the health endpoint of the server
# already listening on E2E_PORT is probed once instead. Otherwise the server
# is started in the background and polled through wait_for_server.
# Globals:   SKIP_SERVER, E2E_PORT, E2E_API_KEY (read); SERVER_PID (written);
#            AGENTKIT_E2E_MODE, AGENTKIT_WS_TIMEOUT, AGENTKIT_API_KEY (exported)
# Returns:   0 when a server is reachable, 1 otherwise
#######################################
start_server() {
    local health_url="http://127.0.0.1:$E2E_PORT/api/v1/health"

    if [ "$SKIP_SERVER" = "1" ]; then
        info "SKIP_SERVER=1, using existing server on port $E2E_PORT"
        if ! curl -s "$health_url" &>/dev/null; then
            fail "Existing server is not responding"
            return 1
        fi
        ok "Existing server is healthy"
        return 0
    fi

    info "Starting AgentKit E2E server on port $E2E_PORT..."
    export AGENTKIT_E2E_MODE=1 AGENTKIT_WS_TIMEOUT=0
    export AGENTKIT_API_KEY="$E2E_API_KEY"

    # Launch in the background and remember the PID for later cleanup.
    python3 -m agentkit.cli.main serve --host 127.0.0.1 --port "$E2E_PORT" &
    SERVER_PID=$!

    wait_for_server && return 0

    # Startup failed: best-effort kill of whatever is left of the process.
    kill "$SERVER_PID" 2>/dev/null || true
    return 1
}

#######################################
# Stop the server launched by start_server, if any.
# Safe to call repeatedly: the script registers it as a trap handler for
# several signals, so it can run more than once. SERVER_PID is cleared after
# the first successful pass and later calls do nothing.
# Globals:   SKIP_SERVER (read), SERVER_PID (read, cleared)
# Outputs:   status messages via info / ok
# Returns:   0 always
#######################################
stop_server() {
    if [ "$SKIP_SERVER" = "1" ]; then
        info "SKIP_SERVER=1, not stopping server"
        return 0
    fi
    if [ -n "${SERVER_PID:-}" ]; then
        info "Stopping E2E server (PID: $SERVER_PID)..."
        # Best effort: the process may already have exited on its own.
        kill "$SERVER_PID" 2>/dev/null || true
        wait "$SERVER_PID" 2>/dev/null || true
        # Forget the PID so a repeated invocation is a no-op.
        SERVER_PID=""
        ok "Server stopped"
    fi
    return 0
}

# ── Test Selection ───────────────────────────────────────────────────────────

# Baseline pytest options; the options below append to this array.
PYTEST_ARGS=(--timeout=120 -v --tb=short -s)
# Test path handed to pytest; file-selecting options replace it.
TEST_TARGET="tests/e2e/"
# Mode switches, each 0 (off) or 1 (on).
GENERATE_REPORT=0
ANALYZE=0
SKIP_SERVER_FLAG=0
BASELINE_COMPARE=0

# Every option consumes exactly one argument, so the shift happens once after
# the case statement (--help exits before reaching it).
while (( $# > 0 )); do
    case "$1" in
        # ── Marker-based selection ──
        --basic)
            PYTEST_ARGS+=(-m e2e_basic)
            ;;
        --capability | --full)
            # --full (all capability tests: API + direct + alignment) selects
            # the same marker as --capability.
            PYTEST_ARGS+=(-m e2e_capability)
            ;;
        --analyze)
            ANALYZE=1
            PYTEST_ARGS+=(-m e2e_capability)
            ;;

        # ── Single test file ──
        --cli)
            TEST_TARGET="tests/e2e/test_basic_cli.py"
            ;;
        --api)
            TEST_TARGET="tests/e2e/test_basic_api.py"
            ;;
        --ws)
            TEST_TARGET="tests/e2e/test_basic_websocket.py"
            ;;
        --routing)
            TEST_TARGET="tests/e2e/test_capability_routing.py"
            ;;
        --react)
            TEST_TARGET="tests/e2e/test_capability_react.py"
            ;;
        --team)
            TEST_TARGET="tests/e2e/test_capability_team.py"
            ;;

        # ── Single test file, no HTTP server needed ──
        --direct)
            # Router direct backtest
            TEST_TARGET="tests/e2e/test_capability_router_direct.py"
            SKIP_SERVER_FLAG=1
            ;;
        --alignment)
            # Alignment guard tests
            TEST_TARGET="tests/e2e/test_capability_alignment.py"
            SKIP_SERVER_FLAG=1
            ;;

        # ── Reporting and run modes ──
        --baseline)
            BASELINE_COMPARE=1
            ;;
        --report)
            GENERATE_REPORT=1
            ;;
        --fast)
            PYTEST_ARGS+=(-x --timeout=30)
            ;;

        --help | -h)
            echo "Usage: $0 [--basic|--capability|--cli|--api|--ws|--routing|--react|--team|--direct|--alignment|--full|--baseline|--report|--analyze|--fast]"
            exit 0
            ;;

        # Anything unrecognized is forwarded to pytest unchanged.
        *)
            PYTEST_ARGS+=("$1")
            ;;
    esac
    shift
done

# --report: create the output directory and ask pytest for an HTML report
# plus a JUnit XML file.
if [[ "$GENERATE_REPORT" -eq 1 ]]; then
    mkdir -p "$REPORT_DIR"
    PYTEST_ARGS+=("--html=$REPORT_DIR/e2e_report.html")
    PYTEST_ARGS+=("--self-contained-html")
    PYTEST_ARGS+=("--junitxml=$REPORT_DIR/e2e_junit.xml")
fi

# --analyze: announce the mode up front.
if [[ "$ANALYZE" -eq 1 ]]; then
    info "Analysis mode: will generate capability report with recall/F1/overfitting analysis"
fi

# --direct / --alignment need no HTTP server, so they force SKIP_SERVER on.
if [[ "$SKIP_SERVER_FLAG" -eq 1 ]]; then
    SKIP_SERVER=1
fi

# ── Main ─────────────────────────────────────────────────────────────────────

info "Fischer AgentKit E2E Backtest Runner"
info "====================================="
info "Project:  $PROJECT_ROOT"
info "Port:     $E2E_PORT"
info "Target:   $TEST_TARGET"
info ""

check_deps

# Clean up the server on every exit path. INT/TERM are turned into a regular
# exit: otherwise bash would run the handler and then carry on with the rest
# of the script, and the cleanup would run a second time on EXIT.
trap stop_server EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# --direct / --alignment run without any HTTP server, so neither start one nor
# require an existing one to answer the health check.
if [ "$SKIP_SERVER_FLAG" -eq 1 ]; then
    info "Selected tests do not need the HTTP server; skipping server startup"
elif ! start_server; then
    fail "Could not start E2E server"
    exit 1
fi

info ""
info "Running E2E tests..."
info "===================="
info ""

# Connection details read by the test suite.
export AGENTKIT_SERVER_URL="http://127.0.0.1:$E2E_PORT"
export AGENTKIT_API_KEY="$E2E_API_KEY"

# Capture pytest's status instead of letting `set -e` abort, so the summary
# and the reports below are still produced after a failing run.
EXIT_CODE=0
python3 -m pytest "$TEST_TARGET" "${PYTEST_ARGS[@]}" || EXIT_CODE=$?

echo ""
if [ "$EXIT_CODE" -eq 0 ]; then
    ok "All E2E tests passed!"
else
    fail "Some E2E tests failed (exit code: $EXIT_CODE)"
fi

if [ "$GENERATE_REPORT" -eq 1 ]; then
    info "Report generated at: $REPORT_DIR/e2e_report.html"
fi

if [ "$ANALYZE" -eq 1 ]; then
    CAPABILITY_REPORT="$PROJECT_ROOT/test-results/e2e/capability_report.txt"
    if [ -f "$CAPABILITY_REPORT" ]; then
        info "Capability analysis report:"
        echo ""
        cat "$CAPABILITY_REPORT"
    else
        warn "Capability report not found (may need capability tests to run first)"
    fi
fi

if [ "$BASELINE_COMPARE" -eq 1 ]; then
    CURRENT_REPORT="$PROJECT_ROOT/test-results/e2e/capability_report.json"
    BASELINE_REPORT="$PROJECT_ROOT/test-results/e2e/baseline_capability_report.json"
    if [ -f "$CURRENT_REPORT" ] && [ -f "$BASELINE_REPORT" ]; then
        info "Baseline comparison:"
        # The paths are passed as arguments rather than spliced into the Python
        # source, so quotes or backslashes in them cannot break the program.
        # A failure here only warns: the script's exit status must keep
        # reflecting the test result.
        python3 - "$CURRENT_REPORT" "$BASELINE_REPORT" <<'PY' || warn "Baseline comparison failed"
import json
import sys


def load_metrics(path):
    with open(path, encoding='utf-8') as f:
        return json.load(f)


cur = load_metrics(sys.argv[1])
base = load_metrics(sys.argv[2])

metrics = [
    ('overall_skill_recall', '技能路由召回率'),
    ('overall_skill_precision', '技能路由精确率'),
    ('overall_skill_f1', '技能路由F1'),
    ('overall_execution_mode_accuracy', '执行模式准确率'),
    ('overall_task_success_rate', '任务成功率'),
    ('overfitting_score', '过拟合分数'),
]

print()
for key, label in metrics:
    c = cur.get(key, 0)
    b = base.get(key, 0)
    delta = c - b
    arrow = '↑' if delta > 0 else ('↓' if delta < 0 else '→')
    print(f'  {label}: {b:.2%} → {c:.2%}  {arrow} {delta:+.2%}')
print()
PY
    elif [ -f "$CURRENT_REPORT" ]; then
        info "No baseline report found. Saving current report as baseline."
        cp "$CURRENT_REPORT" "$BASELINE_REPORT"
        info "Baseline saved to: $BASELINE_REPORT"
    else
        warn "No current report found. Run with --analyze first."
    fi
fi

# Propagate pytest's status as the script's own.
exit "$EXIT_CODE"