#!/usr/bin/env bash
# =============================================================================
# Fischer AgentKit — E2E Backtest Runner
# =============================================================================
#
# Usage:
#   ./scripts/run_e2e.sh                  # Run all E2E tests
#   ./scripts/run_e2e.sh --basic          # Run basic function tests only
#   ./scripts/run_e2e.sh --capability     # Run agent capability tests only
#   ./scripts/run_e2e.sh --cli            # Run CLI tests only
#   ./scripts/run_e2e.sh --api            # Run API tests only
#   ./scripts/run_e2e.sh --ws             # Run WebSocket tests only
#   ./scripts/run_e2e.sh --routing        # Run routing intelligence tests
#   ./scripts/run_e2e.sh --react          # Run ReAct intelligence tests
#   ./scripts/run_e2e.sh --team           # Run team collaboration tests
#   ./scripts/run_e2e.sh --report         # Generate HTML report
#   ./scripts/run_e2e.sh --analyze        # Run capability tests + generate analysis report
#   ./scripts/run_e2e.sh --direct         # Run router direct backtest only (no HTTP)
#   ./scripts/run_e2e.sh --alignment      # Run alignment guard tests only
#   ./scripts/run_e2e.sh --full           # Run all: API + direct + alignment
#   ./scripts/run_e2e.sh --baseline       # Compare with last baseline report
#   ./scripts/run_e2e.sh --fast           # Stop at the first failure, 30s test timeout
#   ./scripts/run_e2e.sh --help           # Print the option summary and exit
#
# Any other argument is passed through to pytest unchanged.
#
# Environment:
#   E2E_PORT     - Server port (default: 18765)
#   E2E_API_KEY  - API key for auth (default: ak_live_e2e_test_key_...)
#   SKIP_SERVER  - Set to "1" to skip server startup (use existing)
# =============================================================================
|
|
|
|
# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# ── Configuration ────────────────────────────────────────────────────────────

# Absolute directory of this script, and the directory one level above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# E2E_PORT, E2E_API_KEY and SKIP_SERVER keep a value already present in the
# environment and otherwise fall back to the defaults below.
E2E_PORT="${E2E_PORT:-18765}"
E2E_API_KEY="${E2E_API_KEY:-ak_live_e2e_test_key_000000000000000000000000000000000000000000000000}"
REPORT_DIR="${PROJECT_ROOT}/test-results/e2e"
SKIP_SERVER="${SKIP_SERVER:-0}"

# Relative paths used later (for example tests/e2e/) are relative to the
# project root, so run everything from there.
cd "$PROJECT_ROOT"
|
|
|
|
# ── Colors ───────────────────────────────────────────────────────────────────
|
|
|
|
# ANSI escape sequences, stored with a literal backslash; they are interpreted
# when printed with `echo -e`. They are constants, hence readonly.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color
|
|
|
|
# ── Helper Functions ─────────────────────────────────────────────────────────
|
|
|
|
# Print an informational line: a blue "[INFO]" tag followed by all arguments
# joined into one string. Backslash escapes are interpreted (echo -e).
info() {
  local message="$*"
  echo -e "${BLUE}[INFO]${NC} ${message}"
}
|
|
# Print a success line: a green "[OK]" tag followed by all arguments joined
# into one string. Backslash escapes are interpreted (echo -e).
ok() {
  local message="$*"
  echo -e "${GREEN}[OK]${NC} ${message}"
}
|
|
# Print a warning line: a yellow "[WARN]" tag followed by all arguments joined
# into one string. Backslash escapes are interpreted (echo -e).
warn() {
  local message="$*"
  echo -e "${YELLOW}[WARN]${NC} ${message}"
}
|
|
# Print a failure line: a red "[FAIL]" tag followed by all arguments joined
# into one string. Backslash escapes are interpreted (echo -e). Only prints;
# it does not exit or change the return status of the caller.
fail() {
  local message="$*"
  echo -e "${RED}[FAIL]${NC} ${message}"
}
|
|
|
|
#######################################
# Verify that required external commands can be found by the shell.
# Arguments: optional list of command names; with no arguments only
#            python3 is checked.
# Outputs:   one [FAIL] line per missing command (through fail).
# Returns:   0 when every command is available; otherwise terminates the
#            script with exit status 1 after reporting all missing commands.
#######################################
check_deps() {
  local -a required=("$@")
  local cmd
  local missing=0

  if [ "${#required[@]}" -eq 0 ]; then
    required=(python3)
  fi

  for cmd in "${required[@]}"; do
    if ! command -v "$cmd" &>/dev/null; then
      fail "Missing dependency: $cmd"
      missing=1
    fi
  done

  # Exit only after the loop so that every missing command gets reported.
  if [ "$missing" -eq 1 ]; then
    exit 1
  fi
}
|
|
|
|
#######################################
# Poll the health endpoint until the server answers or the timeout elapses.
# Globals:   E2E_PORT (read)
#            SERVER_PID (read, optional) - when set, polling stops as soon as
#              that process no longer exists, instead of waiting out the
#              timeout or accepting an answer from some other process that
#              happens to listen on the port.
#            E2E_STARTUP_TIMEOUT (read, optional) - whole seconds to wait,
#              default 30.
# Returns:   0 once a probe succeeds, 1 on timeout or early process exit.
#######################################
wait_for_server() {
  local timeout_s="${E2E_STARTUP_TIMEOUT:-30}"
  local max_attempts=$((timeout_s * 2)) # one probe every 0.5 s
  local attempt=0
  local health_url="http://127.0.0.1:${E2E_PORT}/api/v1/health"

  info "Waiting for server on port $E2E_PORT..."
  while [ "$attempt" -lt "$max_attempts" ]; do
    if [ -n "${SERVER_PID:-}" ] && ! kill -0 "$SERVER_PID" 2>/dev/null; then
      fail "Server process (PID: $SERVER_PID) exited before becoming ready"
      return 1
    fi
    # --max-time keeps one stalled connection from blocking the whole loop.
    # Any completed HTTP exchange counts as "ready", whatever the status code.
    if curl -s --max-time 5 "$health_url" >/dev/null 2>&1; then
      ok "Server is ready on port $E2E_PORT"
      return 0
    fi
    attempt=$((attempt + 1))
    sleep 0.5
  done

  fail "Server failed to start within ${timeout_s} seconds"
  return 1
}
|
|
|
|
#######################################
# Make a server available for the tests.
# With SKIP_SERVER=1 nothing is launched; the function only probes the health
# endpoint of whatever already listens on E2E_PORT. Otherwise it launches
# `python3 -m agentkit.cli.main serve` in the background and waits for it.
# Globals:   SKIP_SERVER, E2E_PORT, E2E_API_KEY (read)
#            SERVER_PID (written; cleared again if startup fails)
#            AGENTKIT_E2E_MODE, AGENTKIT_WS_TIMEOUT, AGENTKIT_API_KEY (exported)
# Returns:   0 when a server is answering, 1 otherwise.
#######################################
start_server() {
  local health_url="http://127.0.0.1:${E2E_PORT}/api/v1/health"

  if [ "$SKIP_SERVER" = "1" ]; then
    info "SKIP_SERVER=1, using existing server on port $E2E_PORT"
    if curl -s --max-time 5 "$health_url" >/dev/null 2>&1; then
      ok "Existing server is healthy"
      return 0
    fi
    fail "Existing server is not responding"
    return 1
  fi

  info "Starting AgentKit E2E server on port $E2E_PORT..."
  export AGENTKIT_E2E_MODE=1
  export AGENTKIT_WS_TIMEOUT=0
  export AGENTKIT_API_KEY="$E2E_API_KEY"

  # Start server in background
  python3 -m agentkit.cli.main serve --host 127.0.0.1 --port "$E2E_PORT" &
  SERVER_PID=$!

  if wait_for_server; then
    return 0
  fi

  # Startup failed: terminate and reap the process here, then forget its PID
  # so that later cleanup does not try to stop it a second time.
  kill "$SERVER_PID" 2>/dev/null || true
  wait "$SERVER_PID" 2>/dev/null || true
  SERVER_PID=""
  return 1
}
|
|
|
|
#######################################
# Stop the server this script launched, if any. Safe to call repeatedly:
# the PID is cleared after the first successful call, so a second call
# (for example signal handler followed by the EXIT trap) does nothing.
# Globals:   SKIP_SERVER (read), SERVER_PID (read, cleared)
# Returns:   0 always.
#######################################
stop_server() {
  if [ "$SKIP_SERVER" = "1" ]; then
    info "SKIP_SERVER=1, not stopping server"
    return 0
  fi

  if [ -z "${SERVER_PID:-}" ]; then
    return 0
  fi

  info "Stopping E2E server (PID: $SERVER_PID)..."
  # Best effort: the process may already be gone, which is not an error here.
  kill "$SERVER_PID" 2>/dev/null || true
  wait "$SERVER_PID" 2>/dev/null || true
  SERVER_PID=""
  ok "Server stopped"
}
|
|
|
|
# ── Test Selection ───────────────────────────────────────────────────────────
|
|
|
|
# Defaults; the command-line options parsed below modify them.
PYTEST_ARGS=("--timeout=120" "-v" "--tb=short" "-s") # always passed to pytest
TEST_TARGET="tests/e2e/"                             # path handed to pytest
GENERATE_REPORT=0                                    # set by --report
ANALYZE=0                                            # set by --analyze
SKIP_SERVER_FLAG=0                                   # set by --direct / --alignment
BASELINE_COMPARE=0                                   # set by --baseline
|
|
|
|
#######################################
# Translate command-line options into the selection variables.
# Globals:   PYTEST_ARGS (appended), TEST_TARGET, GENERATE_REPORT, ANALYZE,
#            SKIP_SERVER_FLAG, BASELINE_COMPARE (written)
# Arguments: the script's command-line arguments
# Outputs:   the usage line on --help / -h (then exits with status 0)
# Notes:     arguments that are not recognised are forwarded to pytest.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --basic)
        PYTEST_ARGS+=("-m" "e2e_basic")
        ;;
      --capability)
        PYTEST_ARGS+=("-m" "e2e_capability")
        ;;
      --cli)
        TEST_TARGET="tests/e2e/test_basic_cli.py"
        ;;
      --api)
        TEST_TARGET="tests/e2e/test_basic_api.py"
        ;;
      --ws)
        TEST_TARGET="tests/e2e/test_basic_websocket.py"
        ;;
      --routing)
        TEST_TARGET="tests/e2e/test_capability_routing.py"
        ;;
      --react)
        TEST_TARGET="tests/e2e/test_capability_react.py"
        ;;
      --team)
        TEST_TARGET="tests/e2e/test_capability_team.py"
        ;;
      --direct)
        # Router direct backtest — no HTTP server needed
        TEST_TARGET="tests/e2e/test_capability_router_direct.py"
        SKIP_SERVER_FLAG=1
        ;;
      --alignment)
        # Alignment guard tests — no HTTP server needed
        TEST_TARGET="tests/e2e/test_capability_alignment.py"
        SKIP_SERVER_FLAG=1
        ;;
      --full)
        # Run all capability tests: API + direct + alignment
        PYTEST_ARGS+=("-m" "e2e_capability")
        ;;
      --baseline)
        BASELINE_COMPARE=1
        ;;
      --report)
        GENERATE_REPORT=1
        ;;
      --analyze)
        ANALYZE=1
        PYTEST_ARGS+=("-m" "e2e_capability")
        ;;
      --fast)
        PYTEST_ARGS+=("-x" "--timeout=30")
        ;;
      --help | -h)
        echo "Usage: $0 [--basic|--capability|--cli|--api|--ws|--routing|--react|--team|--direct|--alignment|--full|--baseline|--report|--analyze|--fast]"
        exit 0
        ;;
      *)
        PYTEST_ARGS+=("$1")
        ;;
    esac
    # Every option consumes exactly one argument.
    shift
  done
}

parse_args "$@"
|
|
|
|
# --report: make sure the output directory exists and ask pytest for an HTML
# report plus a JUnit XML file inside it.
if [ "$GENERATE_REPORT" -eq 1 ]; then
  mkdir -p "$REPORT_DIR"
  PYTEST_ARGS+=(
    "--html=$REPORT_DIR/e2e_report.html"
    "--self-contained-html"
    "--junitxml=$REPORT_DIR/e2e_junit.xml"
  )
fi

if [ "$ANALYZE" -eq 1 ]; then
  info "Analysis mode: will generate capability report with recall/F1/overfitting analysis"
fi

# Override SKIP_SERVER when --direct or --alignment is used (no HTTP needed)
if [ "$SKIP_SERVER_FLAG" -eq 1 ]; then
  SKIP_SERVER=1
fi
|
|
|
|
# ── Main ─────────────────────────────────────────────────────────────────────
|
|
|
|
info "Fischer AgentKit E2E Backtest Runner"
info "====================================="
info "Project: $PROJECT_ROOT"
info "Port: $E2E_PORT"
info "Target: $TEST_TARGET"
info ""

check_deps

# Cleanup runs from the EXIT trap only. INT and TERM are turned into a normal
# exit (128 + signal number) so that an interrupted run actually terminates
# instead of resuming after the handler, and cleanup still happens once.
trap stop_server EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# --direct / --alignment select tests that need no HTTP server, so there is
# nothing to launch and nothing to probe for them.
if [ "$SKIP_SERVER_FLAG" -eq 1 ]; then
  info "Selected tests need no HTTP server; skipping server startup"
elif ! start_server; then
  fail "Could not start E2E server"
  exit 1
fi
|
|
|
|
info ""
info "Running E2E tests..."
info "===================="
info ""

# Connection details exported for the test process.
export AGENTKIT_SERVER_URL="http://127.0.0.1:$E2E_PORT"
export AGENTKIT_API_KEY="$E2E_API_KEY"

# Capture the pytest status instead of letting `set -e` abort, so the summary
# and the optional reports below still run after a failing test session.
EXIT_CODE=0
python3 -m pytest "$TEST_TARGET" "${PYTEST_ARGS[@]}" || EXIT_CODE=$?

echo ""
if [ "$EXIT_CODE" -eq 0 ]; then
  ok "All E2E tests passed!"
elif [ "$EXIT_CODE" -eq 5 ]; then
  # pytest uses exit code 5 when no tests were collected; nothing failed.
  warn "No E2E tests were collected (exit code: $EXIT_CODE)"
else
  fail "Some E2E tests failed (exit code: $EXIT_CODE)"
fi

if [ "$GENERATE_REPORT" -eq 1 ]; then
  info "Report generated at: $REPORT_DIR/e2e_report.html"
fi
|
|
|
|
# --analyze: print the capability report text file if one is present.
if [ "$ANALYZE" -eq 1 ]; then
  CAPABILITY_REPORT="$PROJECT_ROOT/test-results/e2e/capability_report.txt"
  if [ -f "$CAPABILITY_REPORT" ]; then
    info "Capability analysis report:"
    echo ""
    # A read error must not abort the script and hide the test exit code.
    cat "$CAPABILITY_REPORT" || warn "Could not read $CAPABILITY_REPORT"
  else
    warn "Capability report not found (may need capability tests to run first)"
  fi
fi
|
|
|
|
#######################################
# Compare the current capability report with the stored baseline.
# - Both files present: print each metric as baseline → current with delta.
# - Only the current report present: copy it to the baseline location.
# - No current report: print a warning.
# The file paths reach Python as arguments rather than being pasted into the
# program text, so quotes or backslashes in a path cannot break the script.
# Failures are reported as warnings only; this step never changes the exit
# status of the test run.
# Arguments: $1 - current report (JSON), $2 - baseline report (JSON)
# Returns:   0 always.
#######################################
compare_with_baseline() {
  local current_report=$1
  local baseline_report=$2

  if [ -f "$current_report" ] && [ -f "$baseline_report" ]; then
    info "Baseline comparison:"
    if ! python3 - "$current_report" "$baseline_report" <<'PY'
import json
import sys


def load_metrics(path):
    with open(path, encoding='utf-8') as f:
        return json.load(f)


cur = load_metrics(sys.argv[1])
base = load_metrics(sys.argv[2])

metrics = [
    ('overall_skill_recall', '技能路由召回率'),
    ('overall_skill_precision', '技能路由精确率'),
    ('overall_skill_f1', '技能路由F1'),
    ('overall_execution_mode_accuracy', '执行模式准确率'),
    ('overall_task_success_rate', '任务成功率'),
    ('overfitting_score', '过拟合分数'),
]

print()
for key, label in metrics:
    # A missing or null metric counts as 0.
    c = cur.get(key) or 0
    b = base.get(key) or 0
    delta = c - b
    arrow = '↑' if delta > 0 else ('↓' if delta < 0 else '→')
    print(f' {label}: {b:.2%} → {c:.2%} {arrow} {delta:+.2%}')
print()
PY
    then
      warn "Baseline comparison failed; see the error above"
    fi
  elif [ -f "$current_report" ]; then
    info "No baseline report found. Saving current report as baseline."
    if cp "$current_report" "$baseline_report"; then
      info "Baseline saved to: $baseline_report"
    else
      warn "Could not save baseline to: $baseline_report"
    fi
  else
    warn "No current report found. Run with --analyze first."
  fi
  return 0
}

if [ "${BASELINE_COMPARE:-0}" -eq 1 ]; then
  CURRENT_REPORT="$PROJECT_ROOT/test-results/e2e/capability_report.json"
  BASELINE_REPORT="$PROJECT_ROOT/test-results/e2e/baseline_capability_report.json"
  compare_with_baseline "$CURRENT_REPORT" "$BASELINE_REPORT"
fi
|
|
|
|
# Finish with the status captured from the pytest run.
exit "$EXIT_CODE"
|