diff --git a/src/agentkit/cli/benchmark.py b/src/agentkit/cli/benchmark.py
new file mode 100644
index 0000000..45e7dd7
--- /dev/null
+++ b/src/agentkit/cli/benchmark.py
@@ -0,0 +1,1369 @@
+"""Benchmark CLI command — run capability backtests and generate reports.
+
+Tests core AgentKit components directly (no pytest subprocess, no real LLM):
+- preprocessing: RequestPreprocessor routing accuracy
+- overfitting: routing consistency across paraphrases
+- efficiency: component execution timing
+- tool_search: ToolSearchIndex BM25 relevance
+- event_model: SubmissionQueue / EventQueue lifecycle
+- spec_management: SpecManager CRUD operations
+- verification: VerificationLoop execute/retry behavior
+
+Usage:
+    agentkit benchmark                          # run all dimensions
+    agentkit benchmark --dimension preprocessing
+    agentkit benchmark --report                 # JSON + TXT report
+    agentkit benchmark --report --format html   # + HTML report
+    agentkit benchmark --output-dir ./my-results
+    agentkit benchmark --fast                   # core cases only
+    agentkit benchmark --verbose                # detailed output
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import time
+from dataclasses import asdict, dataclass, field
+from datetime import datetime, timezone
+from enum import Enum
+from pathlib import Path
+from typing import Any
+
+import typer
+from rich.console import Console
+from rich.panel import Panel
+from rich.progress import (
+    BarColumn,
+    Progress,
+    SpinnerColumn,
+    TaskProgressColumn,
+    TextColumn,
+)
+from rich.table import Table
+
+# Shared Rich console; the dimension runners below use it for verbose per-case output.
+console = Console()
+
+# Relative directory name for benchmark artifacts.
+# NOTE(review): presumably the default for --output-dir; the consumer is not visible here.
+_DEFAULT_OUTPUT_DIR = "test-results/benchmark"
+
+
+class BenchmarkDimension(str, Enum):
+    """Benchmark test dimensions.
+
+    Mixes in ``str`` so every member is also a plain string equal to its value.
+    """
+
+    PREPROCESSING = "preprocessing"
+    OVERFITTING = "overfitting"
+    EFFICIENCY = "efficiency"
+    TOOL_SEARCH = "tool_search"
+    EVENT_MODEL = "event_model"
+    SPEC_MANAGEMENT = "spec_management"
+    VERIFICATION = "verification"
+    # Not a dimension of its own.
+    # NOTE(review): presumably selects every dimension above; confirm in the CLI handler.
+    ALL = "all"
+
+
+# ---------------------------------------------------------------------------
+# Result data structures
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class TestCaseResult:
+    """Single test case result.
+
+    Plain record created by the dimension runners and collected through
+    ``DimensionResult.add``.
+    """
+
+    case_id: str  # identifier of the case within its dimension
+    passed: bool  # whether the observed outcome met the expectation
+    expected: str  # human-readable description of the expected outcome
+    actual: str  # human-readable description of the observed outcome
+    duration_ms: float  # elapsed time reported for the case, in milliseconds
+    detail: str = ""  # optional free-form context (inputs, iteration counts, ...)
+
+
+@dataclass
+class DimensionResult:
+    """Pass/fail tally plus per-case records for one benchmark dimension."""
+
+    dimension: str
+    total: int = 0
+    passed: int = 0
+    failed: int = 0
+    details: list[TestCaseResult] = field(default_factory=list)
+
+    @property
+    def score(self) -> float:
+        """Fraction of recorded cases that passed; ``0.0`` when nothing was recorded."""
+        if self.total > 0:
+            return self.passed / self.total
+        return 0.0
+
+    def add(self, case: TestCaseResult) -> None:
+        """Record ``case`` and bump the counter matching its outcome."""
+        self.total += 1
+        bucket = "passed" if case.passed else "failed"
+        setattr(self, bucket, getattr(self, bucket) + 1)
+        self.details.append(case)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return a plain-dict summary: rounded score, counters, expanded cases."""
+        summary: dict[str, Any] = {"score": round(self.score, 4)}
+        for counter in ("total", "passed", "failed"):
+            summary[counter] = getattr(self, counter)
+        summary["details"] = [asdict(entry) for entry in self.details]
+        return summary
+
+
+# ---------------------------------------------------------------------------
+# Helpers — mock objects
+# ---------------------------------------------------------------------------
+
+
+def _make_mock_skill_registry():
+    """Return a SkillRegistry pre-populated with two mock skills.
+
+    ``react_agent`` (react mode) is registered first, then ``chat_only``
+    (direct mode). The preprocessing-based runners route against this registry.
+    """
+    from agentkit.skills.base import Skill, SkillConfig
+    from agentkit.skills.registry import SkillRegistry
+
+    # (name, agent_type and execution_mode, description, identity prompt)
+    skill_rows = (
+        ("react_agent", "react", "General ReAct agent", "You are a helpful assistant."),
+        ("chat_only", "direct", "Direct chat agent", "You are a chat bot."),
+    )
+
+    registry = SkillRegistry()
+    for skill_name, mode, summary, identity in skill_rows:
+        config = SkillConfig(
+            name=skill_name,
+            agent_type=mode,
+            description=summary,
+            execution_mode=mode,
+            prompt={"identity": identity},
+        )
+        registry.register(Skill(config))
+    return registry
+
+
+def _make_mock_tools():
+    """Return five stub Tool instances used by the tool_search and efficiency runners.
+
+    The tools are ``read_file``, ``write_file``, ``web_search``, ``shell_exec``
+    and ``http_request``, in that order. Each has a description, an object-type
+    input schema with string properties, and two tags.
+    """
+    from agentkit.tools.base import Tool
+
+    class _FakeTool(Tool):
+        """Tool stub whose ``execute`` always reports success."""
+
+        def __init__(
+            self,
+            name: str,
+            description: str,
+            input_schema: dict[str, Any] | None = None,
+            tags: list[str] | None = None,
+        ):
+            super().__init__(
+                name=name,
+                description=description,
+                input_schema=input_schema,
+                tags=tags or [],
+            )
+
+        async def execute(self, **kwargs) -> dict:
+            return {"status": "ok"}
+
+    def _schema(required: list[str], **props: str) -> dict[str, Any]:
+        """Build an object schema whose properties are all described strings."""
+        return {
+            "type": "object",
+            "properties": {
+                prop: {"type": "string", "description": text} for prop, text in props.items()
+            },
+            "required": required,
+        }
+
+    # (name, description, input schema, tags)
+    catalogue = [
+        (
+            "read_file",
+            "Read the contents of a file from the filesystem.",
+            _schema(["path"], path="file path to read"),
+            ["io", "file"],
+        ),
+        (
+            "write_file",
+            "Write content to a file on the filesystem.",
+            _schema(["path", "content"], path="file path to write", content="content to write"),
+            ["io", "file"],
+        ),
+        (
+            "web_search",
+            "Search the web for information using a search engine.",
+            _schema(["query"], query="search query"),
+            ["web", "search"],
+        ),
+        (
+            "shell_exec",
+            "Execute a shell command and return the output.",
+            _schema(["command"], command="shell command"),
+            ["system", "shell"],
+        ),
+        (
+            "http_request",
+            "Send an HTTP request to a URL and return the response.",
+            _schema(["url"], url="target URL", method="HTTP method"),
+            ["web", "http"],
+        ),
+    ]
+
+    return [
+        _FakeTool(name=tool_name, description=summary, input_schema=schema, tags=labels)
+        for tool_name, summary, schema, labels in catalogue
+    ]
+
+
+# ---------------------------------------------------------------------------
+# Dimension test runners
+# ---------------------------------------------------------------------------
+
+
+async def _run_preprocessing(fast: bool, verbose: bool) -> DimensionResult:
+    """Check that RequestPreprocessor routes each sample input to the expected mode.
+
+    Every case feeds one input string to ``preprocess`` and compares
+    ``routing.execution_mode.value`` with the expected mode. ``fast`` restricts
+    the run to a core subset; ``verbose`` prints one line per case.
+    """
+    from agentkit.chat.request_preprocessor import RequestPreprocessor
+
+    skill_registry = _make_mock_skill_registry()
+    preprocessor = RequestPreprocessor(skill_registry=skill_registry)
+
+    # (case id, user input, expected execution mode)
+    table: list[tuple[str, str, str]] = [
+        ("greeting_cn", "你好", "direct_chat"),
+        ("greeting_en", "hello", "direct_chat"),
+        ("chitchat_thanks", "谢谢", "direct_chat"),
+        ("identity_who", "你是谁", "direct_chat"),
+        ("colloquial_ip_1", "查下ip", "react"),
+        ("colloquial_ip_2", "查看当前ip", "react"),
+        ("tool_search", "搜索golang教程", "react"),
+        ("tool_shell", "执行ls命令", "react"),
+        ("translation", "翻译hello为中文", "react"),
+        ("knowledge", "什么是机器学习", "react"),
+        ("skill_prefix_react", "@skill:react_agent 查看ip", "skill_react"),
+        ("skill_prefix_direct", "@skill:chat_only 你好", "skill_react"),
+        ("skill_not_found", "@skill:nonexistent 做点什么", "react"),
+        ("complex_analysis", "帮我分析一下这个数据并生成报告", "react"),
+        ("empty_fallback", "随便聊聊", "react"),
+    ]
+
+    if fast:
+        # Core subset: a greeting, tool-style queries, and both skill-prefix outcomes.
+        core = {
+            "greeting_cn",
+            "colloquial_ip_1",
+            "tool_search",
+            "skill_prefix_react",
+            "skill_not_found",
+        }
+        table = [row for row in table if row[0] in core]
+
+    outcome = DimensionResult(dimension="preprocessing")
+
+    for case_id, text, wanted in table:
+        began = time.perf_counter()
+        routing = await preprocessor.preprocess(content=text)
+        took_ms = (time.perf_counter() - began) * 1000
+
+        mode = routing.execution_mode.value
+        ok = mode == wanted
+
+        outcome.add(
+            TestCaseResult(
+                case_id=case_id,
+                passed=ok,
+                expected=wanted,
+                actual=mode,
+                duration_ms=round(took_ms, 2),
+                detail=f"input={text!r} method={routing.match_method}",
+            )
+        )
+
+        if not verbose:
+            continue
+        if ok:
+            console.print(f"  [green]✓[/green] {case_id}: {mode} ({took_ms:.1f}ms)")
+        else:
+            console.print(
+                f"  [red]✗[/red] {case_id}: expected={wanted} "
+                f"actual={mode} ({routing.match_method})"
+            )
+
+    return outcome
+
+
+async def _run_overfitting(fast: bool, verbose: bool) -> DimensionResult:
+    """Test routing consistency across paraphrases (overfitting detection).
+
+    Same intent expressed differently should route to the same execution mode.
+    A group passes only when every paraphrase yields the group's expected mode.
+
+    Args:
+        fast: When true, only the first two paraphrase groups are evaluated.
+        verbose: When true, print one status line per group.
+
+    Returns:
+        A ``DimensionResult`` with one case per paraphrase group. ``duration_ms``
+        is the time spent routing all paraphrases of that group.
+    """
+    from agentkit.chat.request_preprocessor import RequestPreprocessor
+
+    registry = _make_mock_skill_registry()
+    preprocessor = RequestPreprocessor(skill_registry=registry)
+
+    paraphrase_groups: list[dict[str, Any]] = [
+        {
+            "id": "ip_check_variants",
+            "paraphrases": ["查下ip", "查看当前ip", "获取ip地址", "看下ip", "帮我查一下ip"],
+            "expected": "react",
+        },
+        {
+            "id": "search_variants",
+            "paraphrases": ["搜索golang教程", "搜一下golang教程", "找下golang学习资料"],
+            "expected": "react",
+        },
+        {
+            "id": "greeting_variants",
+            "paraphrases": ["你好", "hello", "hi", "嗨", "哈喽"],
+            "expected": "direct_chat",
+        },
+    ]
+
+    if fast:
+        paraphrase_groups = paraphrase_groups[:2]
+
+    result = DimensionResult(dimension="overfitting")
+
+    for group in paraphrase_groups:
+        # Time the whole group so the report shows a real duration, like the other runners.
+        start = time.perf_counter()
+        modes: list[str] = []
+        for text in group["paraphrases"]:
+            routing = await preprocessor.preprocess(content=text)
+            modes.append(routing.execution_mode.value)
+        elapsed_ms = (time.perf_counter() - start) * 1000
+
+        unique_modes = set(modes)
+        consistent = len(unique_modes) == 1
+        expected_mode = group["expected"]
+        # Passes only if the single mode seen is the expected one; an empty group fails.
+        correct = unique_modes == {expected_mode}
+
+        result.add(
+            TestCaseResult(
+                case_id=group["id"],
+                passed=correct,
+                expected=expected_mode,
+                actual=",".join(modes),
+                duration_ms=round(elapsed_ms, 2),
+                detail=f"paraphrases={len(group['paraphrases'])} consistent={consistent}",
+            )
+        )
+
+        if verbose:
+            status = "[green]✓[/green]" if correct else "[red]✗[/red]"
+            console.print(f"  {status} {group['id']}: modes={modes}")
+
+    return result
+
+
+async def _run_efficiency(fast: bool, verbose: bool) -> DimensionResult:
+    """Test component execution efficiency (timing bounds).
+
+    Each spec repeats one operation ``iterations`` times and passes when the
+    average time per call stays within ``max_ms``. A spec carries either a
+    ``content`` key (awaited ``RequestPreprocessor.preprocess`` call) or a
+    ``query`` key (synchronous ``ToolSearchIndex.search`` call with ``top_k=5``).
+
+    Args:
+        fast: When true, only ``preprocess_greeting`` and ``tool_search_query`` run.
+        verbose: When true, print one status line per spec.
+
+    Returns:
+        A ``DimensionResult`` with one case per spec. ``duration_ms`` is the total
+        time for all iterations of that spec.
+    """
+    from agentkit.chat.request_preprocessor import RequestPreprocessor
+    from agentkit.tools.search import ToolSearchIndex
+
+    registry = _make_mock_skill_registry()
+    preprocessor = RequestPreprocessor(skill_registry=registry)
+    tools = _make_mock_tools()
+    search_index = ToolSearchIndex(tools)
+
+    # Thresholds in milliseconds (generous — these are pure-Python ops).
+    # The operation is stated explicitly per spec rather than inferred from the id.
+    thresholds: list[dict[str, Any]] = [
+        {"id": "preprocess_greeting", "content": "你好", "max_ms": 50.0, "iterations": 100},
+        {"id": "preprocess_react", "content": "查下ip", "max_ms": 50.0, "iterations": 100},
+        {
+            "id": "preprocess_skill_prefix",
+            "content": "@skill:react_agent test",
+            "max_ms": 50.0,
+            "iterations": 100,
+        },
+        {"id": "tool_search_query", "query": "read file", "max_ms": 10.0, "iterations": 200},
+        {"id": "tool_search_empty", "query": "", "max_ms": 5.0, "iterations": 200},
+    ]
+
+    if fast:
+        fast_ids = {"preprocess_greeting", "tool_search_query"}
+        thresholds = [t for t in thresholds if t["id"] in fast_ids]
+
+    result = DimensionResult(dimension="efficiency")
+
+    for spec in thresholds:
+        iterations = spec["iterations"]
+        start = time.perf_counter()
+        if "content" in spec:
+            content = spec["content"]
+            for _ in range(iterations):
+                await preprocessor.preprocess(content=content)
+        else:
+            query = spec["query"]
+            for _ in range(iterations):
+                search_index.search(query, top_k=5)
+        total_ms = (time.perf_counter() - start) * 1000
+        avg_ms = total_ms / iterations
+
+        passed = avg_ms <= spec["max_ms"]
+        result.add(
+            TestCaseResult(
+                case_id=spec["id"],
+                passed=passed,
+                expected=f"<= {spec['max_ms']}ms/call",
+                actual=f"{avg_ms:.3f}ms/call",
+                duration_ms=round(total_ms, 2),
+                detail=f"iterations={spec['iterations']}",
+            )
+        )
+
+        if verbose:
+            status = "[green]✓[/green]" if passed else "[red]✗[/red]"
+            console.print(
+                f"  {status} {spec['id']}: {avg_ms:.3f}ms/call "
+                f"(threshold {spec['max_ms']}ms)"
+            )
+
+    return result
+
+
+async def _run_tool_search(fast: bool, verbose: bool) -> DimensionResult:
+    """Check which tool ToolSearchIndex ranks first for a set of queries.
+
+    A case passes when the first hit's name equals the expected tool, or, for
+    cases marked ``__none__``, when the search returns no hits at all.
+    ``fast`` restricts the run to a core subset; ``verbose`` prints one line
+    per case.
+    """
+    from agentkit.tools.search import ToolSearchIndex
+
+    index = ToolSearchIndex(_make_mock_tools())
+
+    no_hit = "__none__"  # expectation marker: the search must come back empty
+    default_k = 5
+
+    # (case id, query, expected top hit, top_k)
+    rows: list[tuple[str, str, str, int]] = [
+        ("read_file_query", "read file", "read_file", default_k),
+        ("write_file_query", "write file content", "write_file", default_k),
+        ("web_search_query", "search web information", "web_search", default_k),
+        ("shell_exec_query", "execute shell command", "shell_exec", default_k),
+        ("http_request_query", "send http request url", "http_request", default_k),
+        ("file_tag_query", "io file", "read_file", default_k),
+        ("empty_query", "", no_hit, default_k),
+        ("no_match_query", "zzzznonexistent", no_hit, default_k),
+        ("top_k_limit", "file", "read_file", 1),
+        ("multi_token_query", "search query engine", "web_search", default_k),
+    ]
+
+    if fast:
+        core = {"read_file_query", "web_search_query", "empty_query", "top_k_limit"}
+        rows = [row for row in rows if row[0] in core]
+
+    outcome = DimensionResult(dimension="tool_search")
+
+    for case_id, query, wanted, limit in rows:
+        began = time.perf_counter()
+        hits = index.search(query, top_k=limit)
+        took_ms = (time.perf_counter() - began) * 1000
+
+        if wanted == no_hit:
+            ok = len(hits) == 0
+            top = "[]" if ok else hits[0].name
+        else:
+            top = hits[0].name if hits else "__empty__"
+            ok = top == wanted
+
+        outcome.add(
+            TestCaseResult(
+                case_id=case_id,
+                passed=ok,
+                expected=wanted,
+                actual=top,
+                duration_ms=round(took_ms, 2),
+                detail=f"query={query!r} top_k={limit} results={len(hits)}",
+            )
+        )
+
+        if verbose:
+            mark = "[green]✓[/green]" if ok else "[red]✗[/red]"
+            console.print(f"  {mark} {case_id}: top={top} ({took_ms:.2f}ms)")
+
+    return outcome
+
+
+async def _run_event_model(fast: bool, verbose: bool) -> DimensionResult:
+    """Test SubmissionQueue / EventQueue lifecycle.
+
+    Args:
+        fast: When true, only the core cases (``sq_submit_drain``,
+            ``eq_emit_subscribe_replay``, ``eq_close_sentinel``) are executed.
+            The other cases are skipped outright instead of being run and
+            filtered out afterwards.
+        verbose: When true, print one status line per executed case.
+
+    Returns:
+        A ``DimensionResult`` with one ``TestCaseResult`` per executed case.
+    """
+    from agentkit.core.event_queue import EventQueue, SubmissionQueue
+    from agentkit.core.protocol import Event
+
+    result = DimensionResult(dimension="event_model")
+
+    def _record(
+        case_id: str, passed: object, expected: str, actual: str, elapsed_ms: float
+    ) -> None:
+        """Store one case result and, in verbose mode, print its status line."""
+        ok = bool(passed)
+        result.add(
+            TestCaseResult(
+                case_id=case_id,
+                passed=ok,
+                expected=expected,
+                actual=actual,
+                duration_ms=round(elapsed_ms, 2),
+            )
+        )
+        if verbose:
+            console.print(f"  {'[green]✓[/green]' if ok else '[red]✗[/red]'} {case_id}")
+
+    # --- SubmissionQueue tests ---
+    sq = SubmissionQueue()
+
+    # Test 1 (core): submit and drain
+    start = time.perf_counter()
+    task_id = await sq.submit("hello", "session-1")
+    drained: list[str] = []
+    async for submission in sq.drain():
+        drained.append(submission.content)
+        break  # only drain one to avoid blocking
+    elapsed_ms = (time.perf_counter() - start) * 1000
+    _record(
+        "sq_submit_drain",
+        task_id != "" and drained == ["hello"],
+        "task_id + drained=['hello']",
+        f"task_id={task_id[:8]}... drained={drained}",
+        elapsed_ms,
+    )
+
+    if not fast:
+        # Test 2: cancel
+        start = time.perf_counter()
+        cancel_id = await sq.submit("to-cancel", "session-2")
+        cancelled = await sq.cancel(cancel_id)
+        elapsed_ms = (time.perf_counter() - start) * 1000
+        # NOTE(review): reads the queue's private ``_submissions`` mapping to confirm the flag.
+        _record(
+            "sq_cancel",
+            cancelled and sq._submissions[cancel_id].cancelled,
+            "cancelled=True",
+            f"cancelled={cancelled}",
+            elapsed_ms,
+        )
+
+        # Test 3: close blocks new submissions
+        start = time.perf_counter()
+        sq2 = SubmissionQueue()
+        sq2.close()
+        raised = False
+        try:
+            await sq2.submit("after-close", "session-3")
+        except RuntimeError:
+            raised = True
+        elapsed_ms = (time.perf_counter() - start) * 1000
+        _record(
+            "sq_close_blocks",
+            raised and sq2.is_closed,
+            "RuntimeError on submit after close",
+            f"raised={raised} closed={sq2.is_closed}",
+            elapsed_ms,
+        )
+
+    # --- EventQueue tests ---
+    eq = EventQueue(buffer_size=10)
+
+    # Test 4 (core): emit and subscribe with replay
+    start = time.perf_counter()
+    test_event = Event(
+        event_type="test_event",
+        task_id="task-1",
+        session_id="session-1",
+        data={"msg": "hello"},
+        timestamp=datetime.now(timezone.utc).isoformat(),
+    )
+    await eq.emit(test_event)
+
+    received: list[Event] = []
+    # Subscribe after the emit and collect one event (replay)
+    async for event in eq.subscribe():
+        received.append(event)
+        break
+    elapsed_ms = (time.perf_counter() - start) * 1000
+    _record(
+        "eq_emit_subscribe_replay",
+        len(received) == 1 and received[0].event_type == "test_event",
+        "1 event replayed",
+        f"{len(received)} events",
+        elapsed_ms,
+    )
+
+    # Test 5 (core): close sends sentinel
+    start = time.perf_counter()
+    eq2 = EventQueue()
+
+    async def _consume_all() -> list[Event]:
+        events: list[Event] = []
+        async for ev in eq2.subscribe():
+            events.append(ev)
+        return events
+
+    # Start consumer, emit, then close
+    consumer_task = asyncio.create_task(_consume_all())
+    await asyncio.sleep(0.01)  # let subscriber register
+    await eq2.emit(test_event)
+    await asyncio.sleep(0.01)
+    eq2.close()
+    timed_out = False
+    events: list[Event] = []
+    try:
+        events = await asyncio.wait_for(consumer_task, timeout=2.0)
+    except asyncio.TimeoutError:
+        # A subscriber that never exits is a failed case, not a reason to abort
+        # the whole benchmark; wait_for has already cancelled the consumer task.
+        timed_out = True
+    elapsed_ms = (time.perf_counter() - start) * 1000
+    if timed_out:
+        close_actual = f"subscriber did not exit within 2.0s, closed={eq2.is_closed}"
+    else:
+        close_actual = f"{len(events)} events, closed={eq2.is_closed}"
+    _record(
+        "eq_close_sentinel",
+        not timed_out and len(events) >= 1 and eq2.is_closed,
+        "subscriber exits on close",
+        close_actual,
+        elapsed_ms,
+    )
+
+    if not fast:
+        # Test 6: subscriber count
+        start = time.perf_counter()
+        eq3 = EventQueue()
+        initial_count = eq3.subscriber_count
+        elapsed_ms = (time.perf_counter() - start) * 1000
+        _record(
+            "eq_subscriber_count",
+            initial_count == 0,
+            "0 subscribers initially",
+            f"{initial_count} subscribers",
+            elapsed_ms,
+        )
+
+    return result
+
+
+async def _run_spec_management(fast: bool, verbose: bool, tmp_dir: Path) -> DimensionResult:
+    """Test SpecManager CRUD operations.
+
+    The cases share one ``SpecManager`` rooted at ``tmp_dir / "specs"`` and
+    build on each other's on-disk state, so they run in a fixed order.
+
+    Args:
+        fast: When true, only the core cases (``spec_create``, ``spec_get``,
+            ``spec_confirm``, ``spec_delete``) are executed. The second spec that
+            ``spec_delete`` removes is still created, as untimed setup.
+        verbose: When true, print one status line per executed case.
+        tmp_dir: Directory under which the ``specs`` folder is placed.
+            NOTE(review): ``spec_list`` / ``spec_delete`` expect exact counts, so
+            this presumably must not already contain specs — confirm with the caller.
+
+    Returns:
+        A ``DimensionResult`` with one ``TestCaseResult`` per executed case.
+    """
+    from agentkit.core.spec_manager import Spec, SpecManager, SpecStep
+
+    specs_dir = str(tmp_dir / "specs")
+    manager = SpecManager(specs_dir=specs_dir)
+
+    result = DimensionResult(dimension="spec_management")
+
+    def _record(
+        case_id: str, passed: object, expected: str, actual: str, elapsed_ms: float
+    ) -> None:
+        """Store one case result and, in verbose mode, print its status line."""
+        ok = bool(passed)
+        result.add(
+            TestCaseResult(
+                case_id=case_id,
+                passed=ok,
+                expected=expected,
+                actual=actual,
+                duration_ms=round(elapsed_ms, 2),
+            )
+        )
+        if verbose:
+            console.print(f"  {'[green]✓[/green]' if ok else '[red]✗[/red]'} {case_id}")
+
+    # Test 1 (core): create
+    start = time.perf_counter()
+    spec = Spec(
+        spec_id="spec-001",
+        goal="Test goal",
+        steps=[
+            SpecStep(step_id="s1", name="step1", description="first step"),
+            SpecStep(step_id="s2", name="step2", description="second step", dependencies=["s1"]),
+        ],
+    )
+    path = manager.create(spec)
+    elapsed_ms = (time.perf_counter() - start) * 1000
+    _record(
+        "spec_create",
+        path.exists(),
+        "file exists on disk",
+        f"exists={path.exists()}",
+        elapsed_ms,
+    )
+
+    # Test 2 (core): get
+    start = time.perf_counter()
+    loaded = manager.get("spec-001")
+    elapsed_ms = (time.perf_counter() - start) * 1000
+    _record(
+        "spec_get",
+        loaded is not None and loaded.spec_id == "spec-001" and len(loaded.steps) == 2,
+        "spec with 2 steps",
+        f"steps={len(loaded.steps) if loaded else 0}",
+        elapsed_ms,
+    )
+
+    if not fast:
+        # Test 3: update
+        start = time.perf_counter()
+        updated = manager.update("spec-001", goal="Updated goal")
+        elapsed_ms = (time.perf_counter() - start) * 1000
+        _record(
+            "spec_update",
+            updated is not None and updated.goal == "Updated goal",
+            "goal='Updated goal'",
+            f"goal={updated.goal if updated else None}",
+            elapsed_ms,
+        )
+
+    # Test 4 (core): confirm
+    start = time.perf_counter()
+    confirmed = manager.confirm("spec-001")
+    elapsed_ms = (time.perf_counter() - start) * 1000
+    _record(
+        "spec_confirm",
+        confirmed is not None
+        and confirmed.status == "confirmed"
+        and confirmed.confirmed_at is not None
+        and all(s.status == "confirmed" for s in confirmed.steps),
+        "status=confirmed, all steps confirmed",
+        f"status={confirmed.status if confirmed else None}",
+        elapsed_ms,
+    )
+
+    # Test 5: list. It also creates the second spec that the delete case removes.
+    if fast:
+        # Setup only: the delete case below still needs spec-002 on disk.
+        manager.create(Spec(spec_id="spec-002", goal="Second goal"))
+    else:
+        start = time.perf_counter()
+        spec2 = Spec(spec_id="spec-002", goal="Second goal")
+        manager.create(spec2)
+        specs = manager.list_specs()
+        elapsed_ms = (time.perf_counter() - start) * 1000
+        _record("spec_list", len(specs) == 2, "2 specs", f"{len(specs)} specs", elapsed_ms)
+
+    # Test 6 (core): delete
+    start = time.perf_counter()
+    deleted = manager.delete("spec-002")
+    remaining = manager.list_specs()
+    elapsed_ms = (time.perf_counter() - start) * 1000
+    _record(
+        "spec_delete",
+        deleted and len(remaining) == 1,
+        "deleted, 1 remaining",
+        f"deleted={deleted}, remaining={len(remaining)}",
+        elapsed_ms,
+    )
+
+    if not fast:
+        # Test 7: get nonexistent
+        start = time.perf_counter()
+        missing = manager.get("nonexistent")
+        elapsed_ms = (time.perf_counter() - start) * 1000
+        _record("spec_get_missing", missing is None, "None", f"{missing}", elapsed_ms)
+
+    return result
+
+
+async def _run_verification(fast: bool, verbose: bool, tmp_dir: Path) -> DimensionResult:
+    """Test VerificationLoop execute/retry behavior.
+
+    Each case builds a ``VerificationLoop`` over simple commands (``true``,
+    ``false``, ``sleep 10``) with ``tmp_dir`` as the working directory.
+    NOTE(review): these commands presumably need a POSIX-like environment —
+    confirm the behavior on Windows.
+
+    Args:
+        fast: When true, only the core cases (``verify_pass``, ``verify_fail``,
+            ``verify_retry``) are executed. The timeout and multi-command cases
+            are skipped outright, so fast mode does not wait on the timeout case.
+        verbose: When true, print one status line per executed case.
+        tmp_dir: Working directory passed to every ``VerificationLoop``.
+
+    Returns:
+        A ``DimensionResult`` with one ``TestCaseResult`` per executed case.
+    """
+    from agentkit.core.verification_loop import VerificationLoop
+
+    result = DimensionResult(dimension="verification")
+
+    def _make_loop(commands: list[str], max_retries: int, timeout: float) -> VerificationLoop:
+        """Build a loop that runs ``commands`` inside ``tmp_dir``."""
+        return VerificationLoop(
+            commands=commands,
+            max_retries=max_retries,
+            working_dir=str(tmp_dir),
+            timeout=timeout,
+        )
+
+    def _record(
+        case_id: str, passed: object, expected: str, actual: str, elapsed_ms: float
+    ) -> None:
+        """Store one case result and, in verbose mode, print its status line."""
+        ok = bool(passed)
+        result.add(
+            TestCaseResult(
+                case_id=case_id,
+                passed=ok,
+                expected=expected,
+                actual=actual,
+                duration_ms=round(elapsed_ms, 2),
+            )
+        )
+        if verbose:
+            console.print(f"  {'[green]✓[/green]' if ok else '[red]✗[/red]'} {case_id}")
+
+    # Test 1 (core): passing command
+    start = time.perf_counter()
+    res = await _make_loop(["true"], max_retries=0, timeout=5.0).verify()
+    elapsed_ms = (time.perf_counter() - start) * 1000
+    _record(
+        "verify_pass",
+        res.passed and res.attempts == 1,
+        "passed=True, attempts=1",
+        f"passed={res.passed}, attempts={res.attempts}",
+        elapsed_ms,
+    )
+
+    # Test 2 (core): failing command
+    start = time.perf_counter()
+    res = await _make_loop(["false"], max_retries=0, timeout=5.0).verify()
+    elapsed_ms = (time.perf_counter() - start) * 1000
+    _record(
+        "verify_fail",
+        not res.passed and len(res.errors) > 0,
+        "passed=False, has errors",
+        f"passed={res.passed}, errors={len(res.errors)}",
+        elapsed_ms,
+    )
+
+    # Test 3 (core): retry with fix callback
+    start = time.perf_counter()
+    call_count = 0
+
+    async def _fix_callback(errors: list[str], output: str) -> None:
+        nonlocal call_count
+        call_count += 1
+
+    # Use a command that always fails so every retry is consumed:
+    # 1 initial attempt + 2 retries, with the callback invoked between attempts.
+    loop_retry = _make_loop(["false"], max_retries=2, timeout=5.0)
+    res = await loop_retry.verify_and_retry(fix_callback=_fix_callback)
+    elapsed_ms = (time.perf_counter() - start) * 1000
+    _record(
+        "verify_retry",
+        not res.passed and res.attempts == 3 and call_count == 2,
+        "attempts=3, fix_callback called 2x",
+        f"attempts={res.attempts}, callbacks={call_count}",
+        elapsed_ms,
+    )
+
+    if not fast:
+        # Test 4: timeout (command outlives the 0.5s limit)
+        start = time.perf_counter()
+        res = await _make_loop(["sleep 10"], max_retries=0, timeout=0.5).verify()
+        elapsed_ms = (time.perf_counter() - start) * 1000
+        _record(
+            "verify_timeout",
+            not res.passed and any("timed out" in e.lower() for e in res.errors),
+            "timeout error",
+            f"passed={res.passed}, errors={len(res.errors)}",
+            elapsed_ms,
+        )
+
+        # Test 5: multiple commands (one passes, one fails)
+        start = time.perf_counter()
+        res = await _make_loop(["true", "false"], max_retries=0, timeout=5.0).verify()
+        elapsed_ms = (time.perf_counter() - start) * 1000
+        _record(
+            "verify_multi_command",
+            not res.passed and "false" in res.test_output,
+            "overall fail, output has both commands",
+            f"passed={res.passed}",
+            elapsed_ms,
+        )
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# Report generation
+# ---------------------------------------------------------------------------
+
+
+def _generate_json_report(
+    report_data: dict[str, Any],
+    output_path: Path,
+) -> None:
+    """Serialize ``report_data`` as indented UTF-8 JSON at ``output_path``.
+
+    Missing parent directories are created first. Non-ASCII characters are
+    written verbatim instead of being escaped.
+    """
+    target_dir = output_path.parent
+    target_dir.mkdir(parents=True, exist_ok=True)
+    payload = json.dumps(report_data, ensure_ascii=False, indent=2)
+    output_path.write_text(payload, encoding="utf-8")
+
+
+def _generate_txt_report(
+    report_data: dict[str, Any],
+    output_path: Path,
+) -> None:
+    """Write a fixed-width plain-text rendering of the benchmark report.
+
+    The file holds a metadata header, one table row per dimension, an
+    ``OVERALL`` row summed from those rows, and finally either the list of
+    failed cases or a note that everything passed.
+
+    Args:
+        report_data: Report mapping with ``timestamp``, ``version``,
+            ``overall_score``, ``summary`` and ``dimensions`` keys.
+        output_path: Destination file; parent directories are created.
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    heavy_rule = "=" * 70
+    light_rule = "-" * 70
+
+    out: list[str] = [
+        heavy_rule,
+        "AgentKit Benchmark Report",
+        heavy_rule,
+        f"Timestamp:      {report_data['timestamp']}",
+        f"Version:        {report_data['version']}",
+        f"Overall Score:  {report_data['overall_score']:.1%}",
+        f"Summary:        {report_data['summary']}",
+        "",
+        light_rule,
+        f"{'Dimension':<20} {'Total':>6} {'Pass':>6} {'Fail':>6} {'Score':>8}",
+        light_rule,
+    ]
+
+    dimensions = report_data["dimensions"]
+
+    # Per-dimension rows, accumulating the grand totals for the OVERALL row.
+    grand_total = grand_pass = grand_fail = 0
+    for name, stats in dimensions.items():
+        n_total = stats["total"]
+        n_pass = stats["passed"]
+        n_fail = stats["failed"]
+        pct = stats["score"]
+        out.append(f"{name:<20} {n_total:>6} {n_pass:>6} {n_fail:>6} {pct:>7.1%}")
+        grand_total += n_total
+        grand_pass += n_pass
+        grand_fail += n_fail
+
+    out.append(light_rule)
+    # Guard against an empty run so the ratio never divides by zero.
+    overall = grand_pass / grand_total if grand_total > 0 else 0.0
+    out.append(
+        f"{'OVERALL':<20} {grand_total:>6} {grand_pass:>6} {grand_fail:>6} {overall:>7.1%}"
+    )
+    out.append(heavy_rule)
+    out.append("")
+
+    # Collect the failure listing separately so its heading is emitted only
+    # when at least one case failed.
+    failure_lines: list[str] = []
+    for name, stats in dimensions.items():
+        for case in stats["details"]:
+            if case["passed"]:
+                continue
+            failure_lines.append(f"  [{name}] {case['case_id']}")
+            failure_lines.append(f"    expected: {case['expected']}")
+            failure_lines.append(f"    actual:   {case['actual']}")
+            extra = case.get("detail")
+            if extra:
+                failure_lines.append(f"    detail:   {extra}")
+            failure_lines.append("")
+
+    if failure_lines:
+        out += ["Failed Cases:", light_rule, *failure_lines]
+    else:
+        out += ["All tests passed — no failures to report.", ""]
+
+    output_path.write_text("\n".join(out), encoding="utf-8")
+
+
+def _generate_html_report(
+    report_data: dict[str, Any],
+    output_path: Path,
+) -> None:
+    """Render the benchmark report as a self-contained HTML page.
+
+    Args:
+        report_data: Report mapping with ``timestamp``, ``version``,
+            ``summary`` and ``dimensions`` keys; every dimension entry
+            provides ``total``, ``passed``, ``failed``, ``score`` and a
+            ``details`` list of per-case dicts.
+        output_path: Destination file; parent directories are created.
+
+    All report-derived text is HTML-escaped before interpolation, because
+    values such as ``<= 50.0ms/call`` would otherwise be parsed as markup.
+    The overall score shown is recomputed from the per-dimension totals.
+    """
+    # Imported locally so the block does not rely on a new module-level import.
+    from html import escape
+
+    def _text(value: Any) -> str:
+        """Return ``value`` as text that is safe to embed in HTML."""
+        return escape(str(value))
+
+    def _score_class(score: float) -> str:
+        """Map a 0..1 score onto the CSS class used to colour it."""
+        if score >= 0.9:
+            return "score-good"
+        if score >= 0.7:
+            return "score-warn"
+        return "score-bad"
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    rows_html: list[str] = []
+    total_all = 0
+    pass_all = 0
+    fail_all = 0
+
+    for dim_name, dim_data in report_data["dimensions"].items():
+        total = dim_data["total"]
+        passed = dim_data["passed"]
+        failed = dim_data["failed"]
+        score = dim_data["score"]
+        total_all += total
+        pass_all += passed
+        fail_all += failed
+
+        rows_html.append(
+            f"<tr>"
+            f"<td>{_text(dim_name)}</td>"
+            f"<td class='num'>{total}</td>"
+            f"<td class='num pass'>{passed}</td>"
+            f"<td class='num fail'>{failed}</td>"
+            f"<td class='num {_score_class(score)}'>{score:.1%}</td>"
+            f"</tr>"
+        )
+
+    # Guard against an empty run so the ratio never divides by zero.
+    overall = pass_all / total_all if total_all > 0 else 0.0
+    rows_html.append(
+        f"<tr class='overall-row'>"
+        f"<td><strong>OVERALL</strong></td>"
+        f"<td class='num'><strong>{total_all}</strong></td>"
+        f"<td class='num pass'><strong>{pass_all}</strong></td>"
+        f"<td class='num fail'><strong>{fail_all}</strong></td>"
+        f"<td class='num {_score_class(overall)}'><strong>{overall:.1%}</strong></td>"
+        f"</tr>"
+    )
+
+    # Failure details
+    failure_html: list[str] = []
+    for dim_name, dim_data in report_data["dimensions"].items():
+        for case in dim_data["details"]:
+            if case["passed"]:
+                continue
+            # Mirror the TXT report: show the optional detail line when set.
+            extra = case.get("detail")
+            detail_html = (
+                f"<div class='detail'>detail: {_text(extra)}</div>" if extra else ""
+            )
+            failure_html.append(
+                f"<div class='failure'>"
+                f"<span class='dim'>[{_text(dim_name)}]</span> "
+                f"<span class='case'>{_text(case['case_id'])}</span>"
+                f"<div class='detail'>expected: {_text(case['expected'])}</div>"
+                f"<div class='detail'>actual: {_text(case['actual'])}</div>"
+                f"{detail_html}"
+                f"</div>"
+            )
+
+    failures_section = (
+        "<h2>Failed Cases</h2>" + "".join(failure_html)
+        if failure_html
+        else "<p class='all-pass'>All tests passed.</p>"
+    )
+
+    document = f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<title>AgentKit Benchmark Report</title>
+<style>
+  body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; margin: 2em; }}
+  h1 {{ color: #1a1a2e; }}
+  .meta {{ color: #666; margin-bottom: 1em; }}
+  table {{ border-collapse: collapse; width: 100%; margin: 1em 0; }}
+  th, td {{ border: 1px solid #ddd; padding: 8px 12px; text-align: left; }}
+  th {{ background-color: #1a1a2e; color: white; }}
+  td.num {{ text-align: right; font-family: monospace; }}
+  td.pass {{ color: #2e7d32; }}
+  td.fail {{ color: #c62828; }}
+  .score-good {{ color: #2e7d32; font-weight: bold; }}
+  .score-warn {{ color: #e65100; font-weight: bold; }}
+  .score-bad {{ color: #c62828; font-weight: bold; }}
+  .overall-row {{ background-color: #f5f5f5; }}
+  .failure {{ margin: 0.5em 0; padding: 0.5em; background: #fff3e0; border-left: 3px solid #ff9800; }}
+  .failure .dim {{ color: #e65100; font-weight: bold; }}
+  .failure .case {{ font-family: monospace; }}
+  .failure .detail {{ font-size: 0.85em; color: #555; margin-left: 1em; }}
+  .all-pass {{ color: #2e7d32; font-weight: bold; }}
+</style>
+</head>
+<body>
+<h1>AgentKit Benchmark Report</h1>
+<div class="meta">
+  <p>Timestamp: {_text(report_data['timestamp'])}</p>
+  <p>Version: {_text(report_data['version'])}</p>
+  <p>Overall Score: <strong>{overall:.1%}</strong></p>
+  <p>Summary: {_text(report_data['summary'])}</p>
+</div>
+<h2>Dimension Results</h2>
+<table>
+<thead><tr><th>Dimension</th><th>Total</th><th>Pass</th><th>Fail</th><th>Score</th></tr></thead>
+<tbody>
+{"".join(rows_html)}
+</tbody>
+</table>
+{failures_section}
+</body>
+</html>"""
+
+    output_path.write_text(document, encoding="utf-8")
+
+
+# ---------------------------------------------------------------------------
+# Main command
+# ---------------------------------------------------------------------------
+
+
+def _get_version() -> str:
+    """Return the installed ``fischer-agentkit`` version string.
+
+    Any failure while looking up the package metadata yields the
+    development placeholder ``"0.1.0 (dev)"`` instead of raising.
+    """
+    fallback = "0.1.0 (dev)"
+    try:
+        from importlib import metadata
+
+        resolved = metadata.version("fischer-agentkit")
+    except Exception:
+        # Deliberately broad: version lookup is best-effort for report headers.
+        return fallback
+    return resolved
+
+
+def _build_summary_table(results: dict[str, DimensionResult]) -> Table:
+    """Build the Rich table summarising each dimension plus an OVERALL row.
+
+    Args:
+        results: Mapping of dimension name to its result; each value's
+            ``total``, ``passed``, ``failed`` and ``score`` attributes are read.
+
+    Returns:
+        A table with one row per dimension followed by a bold totals row
+        whose score is passed / total (0.0 when nothing ran).
+    """
+    summary = Table(title="AgentKit Benchmark Results", show_lines=True)
+    summary.add_column("Dimension", style="cyan", no_wrap=True)
+    numeric_columns = (
+        ("Total", "white"),
+        ("Pass", "green"),
+        ("Fail", "red"),
+        ("Score", "magenta"),
+    )
+    for heading, colour in numeric_columns:
+        summary.add_column(heading, justify="right", style=colour)
+
+    grand_total = 0
+    grand_pass = 0
+    grand_fail = 0
+    for name, outcome in results.items():
+        summary.add_row(
+            name,
+            str(outcome.total),
+            str(outcome.passed),
+            str(outcome.failed),
+            f"{outcome.score:.1%}",
+        )
+        grand_total += outcome.total
+        grand_pass += outcome.passed
+        grand_fail += outcome.failed
+
+    # Guard against an empty run so the ratio never divides by zero.
+    if grand_total > 0:
+        overall = grand_pass / grand_total
+    else:
+        overall = 0.0
+
+    summary.add_row(
+        "[bold]OVERALL[/bold]",
+        f"[bold]{grand_total}[/bold]",
+        f"[bold green]{grand_pass}[/bold green]",
+        f"[bold red]{grand_fail}[/bold red]",
+        f"[bold magenta]{overall:.1%}[/bold magenta]",
+    )
+    return summary
+
+
+def benchmark(
+    dimension: BenchmarkDimension = typer.Option(
+        BenchmarkDimension.ALL,
+        "--dimension",
+        "-d",
+        help="Benchmark dimension to run (default: all)",
+    ),
+    report: bool = typer.Option(False, "--report", help="Generate JSON + TXT report files"),
+    format: str = typer.Option(
+        "json",
+        "--format",
+        "-f",
+        help="Report format: json, txt, or html (use with --report)",
+    ),
+    output_dir: str = typer.Option(
+        _DEFAULT_OUTPUT_DIR,
+        "--output-dir",
+        "-o",
+        help="Directory for report output files",
+    ),
+    fast: bool = typer.Option(False, "--fast", help="Run only core test cases"),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Show detailed output"),
+):
+    """Run AgentKit capability benchmarks and generate reports.
+
+    Tests core components directly (no LLM, no pytest subprocess):
+    preprocessing, overfitting, efficiency, tool_search, event_model,
+    spec_management, verification.
+    """
+    # NOTE: the docstring above doubles as the command's --help text, so
+    # implementation notes live in comments instead.
+    #
+    # Flow: run the selected dimension runners, print a summary table,
+    # optionally write JSON/TXT (and HTML) reports into ``output_dir``, then
+    # raise typer.Exit(code=1) when any case failed.
+    import tempfile
+
+    # Normalize dimension to enum (Typer may pass string)
+    if isinstance(dimension, str):
+        dimension = BenchmarkDimension(dimension)
+
+    # Surface a mistyped --format before the run instead of silently
+    # skipping the HTML report at the end. markup=False keeps arbitrary user
+    # input from being interpreted as Rich markup.
+    report_format = format.lower()
+    if report and report_format not in ("json", "txt", "html"):
+        console.print(
+            f"Warning: unknown report format '{format}' "
+            "(expected json, txt, or html); only JSON and TXT reports will be written.",
+            style="yellow",
+            markup=False,
+        )
+
+    console.print()
+    console.print(
+        Panel.fit(
+            "[bold cyan]AgentKit Benchmark[/bold cyan]\n"
+            f"Dimension: [yellow]{dimension.value}[/yellow]  "
+            f"Fast: [yellow]{fast}[/yellow]  "
+            f"Verbose: [yellow]{verbose}[/yellow]",
+            border_style="cyan",
+        )
+    )
+    console.print()
+
+    # Map dimension enum to runner functions. Insertion order is also the
+    # execution order when every dimension is requested.
+    runner_map: dict[BenchmarkDimension, Any] = {
+        BenchmarkDimension.PREPROCESSING: _run_preprocessing,
+        BenchmarkDimension.OVERFITTING: _run_overfitting,
+        BenchmarkDimension.EFFICIENCY: _run_efficiency,
+        BenchmarkDimension.TOOL_SEARCH: _run_tool_search,
+        BenchmarkDimension.EVENT_MODEL: _run_event_model,
+        BenchmarkDimension.SPEC_MANAGEMENT: _run_spec_management,
+        BenchmarkDimension.VERIFICATION: _run_verification,
+    }
+
+    # Determine which dimensions to run; "all" is derived from the runner
+    # map so the two can never drift apart.
+    if dimension == BenchmarkDimension.ALL:
+        dims_to_run = list(runner_map)
+    else:
+        dims_to_run = [dimension]
+
+    results: dict[str, DimensionResult] = {}
+
+    with tempfile.TemporaryDirectory(prefix="agentkit-benchmark-") as tmp:
+        tmp_path = Path(tmp)
+
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            console=console,
+        ) as progress:
+            for dim in dims_to_run:
+                task = progress.add_task(
+                    f"Running {dim.value}...", total=None
+                )
+                runner = runner_map[dim]
+
+                # spec_management and verification need tmp_path
+                if dim in (BenchmarkDimension.SPEC_MANAGEMENT, BenchmarkDimension.VERIFICATION):
+                    dim_result = asyncio.run(runner(fast, verbose, tmp_path))
+                else:
+                    dim_result = asyncio.run(runner(fast, verbose))
+
+                results[dim.value] = dim_result
+                # Mark the indeterminate task as finished (1 of 1).
+                progress.update(task, completed=1, total=1)
+
+    # Display summary table
+    console.print()
+    table = _build_summary_table(results)
+    console.print(table)
+    console.print()
+
+    # Compute overall
+    total_all = sum(r.total for r in results.values())
+    pass_all = sum(r.passed for r in results.values())
+    fail_all = sum(r.failed for r in results.values())
+    overall_score = pass_all / total_all if total_all > 0 else 0.0
+
+    if fail_all == 0:
+        summary = f"All {pass_all} tests passed across {len(results)} dimensions."
+        console.print(f"[bold green]✓ {summary}[/bold green]")
+    else:
+        summary = (
+            f"{pass_all}/{total_all} tests passed ({fail_all} failed) "
+            f"across {len(results)} dimensions."
+        )
+        console.print(f"[bold yellow]⚠ {summary}[/bold yellow]")
+
+    console.print()
+
+    # Generate reports
+    if report:
+        out_path = Path(output_dir)
+        out_path.mkdir(parents=True, exist_ok=True)
+
+        timestamp = datetime.now(timezone.utc).isoformat()
+        version = _get_version()
+
+        report_data: dict[str, Any] = {
+            "timestamp": timestamp,
+            "version": version,
+            "dimensions": {name: r.to_dict() for name, r in results.items()},
+            "overall_score": round(overall_score, 4),
+            "summary": summary,
+        }
+
+        # Always generate JSON
+        json_path = out_path / "benchmark_report.json"
+        _generate_json_report(report_data, json_path)
+        console.print(f"[green]JSON report:[/green] {json_path}")
+
+        # Always generate TXT
+        txt_path = out_path / "benchmark_report.txt"
+        _generate_txt_report(report_data, txt_path)
+        console.print(f"[green]TXT report:[/green] {txt_path}")
+
+        # Generate HTML if requested
+        if report_format == "html":
+            html_path = out_path / "benchmark_report.html"
+            _generate_html_report(report_data, html_path)
+            console.print(f"[green]HTML report:[/green] {html_path}")
+
+        console.print()
+
+    # Exit with non-zero code if any tests failed
+    if fail_all > 0:
+        raise typer.Exit(code=1)
diff --git a/src/agentkit/cli/main.py b/src/agentkit/cli/main.py
index 483e6a9..60ddadb 100644
--- a/src/agentkit/cli/main.py
+++ b/src/agentkit/cli/main.py
@@ -35,6 +35,10 @@ from agentkit.cli.chat import chat  # noqa: E402
 
 app.command(name="chat")(chat)
 
+from agentkit.cli.benchmark import benchmark  # noqa: E402
+
+app.command(name="benchmark")(benchmark)
+
 
 @app.command()
 def gui(
diff --git a/test-results/benchmark/benchmark_report.html b/test-results/benchmark/benchmark_report.html
new file mode 100644
index 0000000..161f0b3
--- /dev/null
+++ b/test-results/benchmark/benchmark_report.html
@@ -0,0 +1,44 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<title>AgentKit Benchmark Report</title>
+<style>
+  body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; margin: 2em; }
+  h1 { color: #1a1a2e; }
+  .meta { color: #666; margin-bottom: 1em; }
+  table { border-collapse: collapse; width: 100%; margin: 1em 0; }
+  th, td { border: 1px solid #ddd; padding: 8px 12px; text-align: left; }
+  th { background-color: #1a1a2e; color: white; }
+  td.num { text-align: right; font-family: monospace; }
+  td.pass { color: #2e7d32; }
+  td.fail { color: #c62828; }
+  .score-good { color: #2e7d32; font-weight: bold; }
+  .score-warn { color: #e65100; font-weight: bold; }
+  .score-bad { color: #c62828; font-weight: bold; }
+  .overall-row { background-color: #f5f5f5; }
+  .failure { margin: 0.5em 0; padding: 0.5em; background: #fff3e0; border-left: 3px solid #ff9800; }
+  .failure .dim { color: #e65100; font-weight: bold; }
+  .failure .case { font-family: monospace; }
+  .failure .detail { font-size: 0.85em; color: #555; margin-left: 1em; }
+  .all-pass { color: #2e7d32; font-weight: bold; }
+</style>
+</head>
+<body>
+<h1>AgentKit Benchmark Report</h1>
+<div class="meta">
+  <p>Timestamp: 2026-06-17T03:26:25.072956+00:00</p>
+  <p>Version: 0.1.0</p>
+  <p>Overall Score: <strong>98.0%</strong></p>
+  <p>Summary: 50/51 tests passed (1 failed) across 7 dimensions.</p>
+</div>
+<h2>Dimension Results</h2>
+<table>
+<thead><tr><th>Dimension</th><th>Total</th><th>Pass</th><th>Fail</th><th>Score</th></tr></thead>
+<tbody>
+<tr><td>preprocessing</td><td class='num'>15</td><td class='num pass'>14</td><td class='num fail'>1</td><td class='num score-good'>93.3%</td></tr><tr><td>overfitting</td><td class='num'>3</td><td class='num pass'>3</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>efficiency</td><td class='num'>5</td><td class='num pass'>5</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>tool_search</td><td class='num'>10</td><td class='num pass'>10</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>event_model</td><td class='num'>6</td><td class='num pass'>6</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>spec_management</td><td class='num'>7</td><td class='num pass'>7</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>verification</td><td class='num'>5</td><td class='num pass'>5</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr class='overall-row'><td><strong>OVERALL</strong></td><td class='num'><strong>51</strong></td><td class='num pass'><strong>50</strong></td><td class='num fail'><strong>1</strong></td><td class='num score-good'><strong>98.0%</strong></td></tr>
+</tbody>
+</table>
+<h2>Failed Cases</h2><div class='failure'><span class='dim'>[preprocessing]</span> <span class='case'>skill_prefix_direct</span><div class='detail'>expected: skill_react</div><div class='detail'>actual: direct_chat</div></div>
+</body>
+</html>
\ No newline at end of file
diff --git a/test-results/benchmark/benchmark_report.json b/test-results/benchmark/benchmark_report.json
new file mode 100644
index 0000000..c63b01b
--- /dev/null
+++ b/test-results/benchmark/benchmark_report.json
@@ -0,0 +1,472 @@
+{
+  "timestamp": "2026-06-17T03:26:25.072956+00:00",
+  "version": "0.1.0",
+  "dimensions": {
+    "preprocessing": {
+      "score": 0.9333,
+      "total": 15,
+      "passed": 14,
+      "failed": 1,
+      "details": [
+        {
+          "case_id": "greeting_cn",
+          "passed": true,
+          "expected": "direct_chat",
+          "actual": "direct_chat",
+          "duration_ms": 0.03,
+          "detail": "input='你好' method=regex_direct"
+        },
+        {
+          "case_id": "greeting_en",
+          "passed": true,
+          "expected": "direct_chat",
+          "actual": "direct_chat",
+          "duration_ms": 0.02,
+          "detail": "input='hello' method=regex_direct"
+        },
+        {
+          "case_id": "chitchat_thanks",
+          "passed": true,
+          "expected": "direct_chat",
+          "actual": "direct_chat",
+          "duration_ms": 0.01,
+          "detail": "input='谢谢' method=regex_direct"
+        },
+        {
+          "case_id": "identity_who",
+          "passed": true,
+          "expected": "direct_chat",
+          "actual": "direct_chat",
+          "duration_ms": 0.02,
+          "detail": "input='你是谁' method=regex_direct"
+        },
+        {
+          "case_id": "colloquial_ip_1",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.02,
+          "detail": "input='查下ip' method=default_react"
+        },
+        {
+          "case_id": "colloquial_ip_2",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.01,
+          "detail": "input='查看当前ip' method=default_react"
+        },
+        {
+          "case_id": "tool_search",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.01,
+          "detail": "input='搜索golang教程' method=default_react"
+        },
+        {
+          "case_id": "tool_shell",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.01,
+          "detail": "input='执行ls命令' method=default_react"
+        },
+        {
+          "case_id": "translation",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.01,
+          "detail": "input='翻译hello为中文' method=default_react"
+        },
+        {
+          "case_id": "knowledge",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.01,
+          "detail": "input='什么是机器学习' method=default_react"
+        },
+        {
+          "case_id": "skill_prefix_react",
+          "passed": true,
+          "expected": "skill_react",
+          "actual": "skill_react",
+          "duration_ms": 0.03,
+          "detail": "input='@skill:react_agent 查看ip' method=skill_prefix"
+        },
+        {
+          "case_id": "skill_prefix_direct",
+          "passed": false,
+          "expected": "skill_react",
+          "actual": "direct_chat",
+          "duration_ms": 0.02,
+          "detail": "input='@skill:chat_only 你好' method=skill_prefix"
+        },
+        {
+          "case_id": "skill_not_found",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.13,
+          "detail": "input='@skill:nonexistent 做点什么' method=skill_not_found_fallback"
+        },
+        {
+          "case_id": "complex_analysis",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.01,
+          "detail": "input='帮我分析一下这个数据并生成报告' method=default_react"
+        },
+        {
+          "case_id": "empty_fallback",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.01,
+          "detail": "input='随便聊聊' method=default_react"
+        }
+      ]
+    },
+    "overfitting": {
+      "score": 1.0,
+      "total": 3,
+      "passed": 3,
+      "failed": 0,
+      "details": [
+        {
+          "case_id": "ip_check_variants",
+          "passed": true,
+          "expected": "react",
+          "actual": "react,react,react,react,react",
+          "duration_ms": 0.0,
+          "detail": "paraphrases=5 consistent=True"
+        },
+        {
+          "case_id": "search_variants",
+          "passed": true,
+          "expected": "react",
+          "actual": "react,react,react",
+          "duration_ms": 0.0,
+          "detail": "paraphrases=3 consistent=True"
+        },
+        {
+          "case_id": "greeting_variants",
+          "passed": true,
+          "expected": "direct_chat",
+          "actual": "direct_chat,direct_chat,direct_chat,direct_chat,direct_chat",
+          "duration_ms": 0.0,
+          "detail": "paraphrases=5 consistent=True"
+        }
+      ]
+    },
+    "efficiency": {
+      "score": 1.0,
+      "total": 5,
+      "passed": 5,
+      "failed": 0,
+      "details": [
+        {
+          "case_id": "preprocess_greeting",
+          "passed": true,
+          "expected": "<= 50.0ms/call",
+          "actual": "0.004ms/call",
+          "duration_ms": 0.44,
+          "detail": "iterations=100"
+        },
+        {
+          "case_id": "preprocess_react",
+          "passed": true,
+          "expected": "<= 50.0ms/call",
+          "actual": "0.004ms/call",
+          "duration_ms": 0.38,
+          "detail": "iterations=100"
+        },
+        {
+          "case_id": "preprocess_skill_prefix",
+          "passed": true,
+          "expected": "<= 50.0ms/call",
+          "actual": "0.005ms/call",
+          "duration_ms": 0.51,
+          "detail": "iterations=100"
+        },
+        {
+          "case_id": "tool_search_query",
+          "passed": true,
+          "expected": "<= 10.0ms/call",
+          "actual": "0.008ms/call",
+          "duration_ms": 1.69,
+          "detail": "iterations=200"
+        },
+        {
+          "case_id": "tool_search_empty",
+          "passed": true,
+          "expected": "<= 5.0ms/call",
+          "actual": "0.000ms/call",
+          "duration_ms": 0.08,
+          "detail": "iterations=200"
+        }
+      ]
+    },
+    "tool_search": {
+      "score": 1.0,
+      "total": 10,
+      "passed": 10,
+      "failed": 0,
+      "details": [
+        {
+          "case_id": "read_file_query",
+          "passed": true,
+          "expected": "read_file",
+          "actual": "read_file",
+          "duration_ms": 0.02,
+          "detail": "query='read file' top_k=5 results=2"
+        },
+        {
+          "case_id": "write_file_query",
+          "passed": true,
+          "expected": "write_file",
+          "actual": "write_file",
+          "duration_ms": 0.02,
+          "detail": "query='write file content' top_k=5 results=2"
+        },
+        {
+          "case_id": "web_search_query",
+          "passed": true,
+          "expected": "web_search",
+          "actual": "web_search",
+          "duration_ms": 0.02,
+          "detail": "query='search web information' top_k=5 results=2"
+        },
+        {
+          "case_id": "shell_exec_query",
+          "passed": true,
+          "expected": "shell_exec",
+          "actual": "shell_exec",
+          "duration_ms": 0.02,
+          "detail": "query='execute shell command' top_k=5 results=1"
+        },
+        {
+          "case_id": "http_request_query",
+          "passed": true,
+          "expected": "http_request",
+          "actual": "http_request",
+          "duration_ms": 0.03,
+          "detail": "query='send http request url' top_k=5 results=1"
+        },
+        {
+          "case_id": "file_tag_query",
+          "passed": true,
+          "expected": "read_file",
+          "actual": "read_file",
+          "duration_ms": 0.02,
+          "detail": "query='io file' top_k=5 results=2"
+        },
+        {
+          "case_id": "empty_query",
+          "passed": true,
+          "expected": "__none__",
+          "actual": "[]",
+          "duration_ms": 0.0,
+          "detail": "query='' top_k=5 results=0"
+        },
+        {
+          "case_id": "no_match_query",
+          "passed": true,
+          "expected": "__none__",
+          "actual": "[]",
+          "duration_ms": 0.01,
+          "detail": "query='zzzznonexistent' top_k=5 results=0"
+        },
+        {
+          "case_id": "top_k_limit",
+          "passed": true,
+          "expected": "read_file",
+          "actual": "read_file",
+          "duration_ms": 0.02,
+          "detail": "query='file' top_k=1 results=1"
+        },
+        {
+          "case_id": "multi_token_query",
+          "passed": true,
+          "expected": "web_search",
+          "actual": "web_search",
+          "duration_ms": 0.03,
+          "detail": "query='search query engine' top_k=5 results=1"
+        }
+      ]
+    },
+    "event_model": {
+      "score": 1.0,
+      "total": 6,
+      "passed": 6,
+      "failed": 0,
+      "details": [
+        {
+          "case_id": "sq_submit_drain",
+          "passed": true,
+          "expected": "task_id + drained=['hello']",
+          "actual": "task_id=571839fb... drained=['hello']",
+          "duration_ms": 0.1,
+          "detail": ""
+        },
+        {
+          "case_id": "sq_cancel",
+          "passed": true,
+          "expected": "cancelled=True",
+          "actual": "cancelled=True",
+          "duration_ms": 0.04,
+          "detail": ""
+        },
+        {
+          "case_id": "sq_close_blocks",
+          "passed": true,
+          "expected": "RuntimeError on submit after close",
+          "actual": "raised=True closed=True",
+          "duration_ms": 0.02,
+          "detail": ""
+        },
+        {
+          "case_id": "eq_emit_subscribe_replay",
+          "passed": true,
+          "expected": "1 event replayed",
+          "actual": "1 events",
+          "duration_ms": 0.07,
+          "detail": ""
+        },
+        {
+          "case_id": "eq_close_sentinel",
+          "passed": true,
+          "expected": "subscriber exits on close",
+          "actual": "1 events, closed=True",
+          "duration_ms": 21.59,
+          "detail": ""
+        },
+        {
+          "case_id": "eq_subscriber_count",
+          "passed": true,
+          "expected": "0 subscribers initially",
+          "actual": "0 subscribers",
+          "duration_ms": 0.01,
+          "detail": ""
+        }
+      ]
+    },
+    "spec_management": {
+      "score": 1.0,
+      "total": 7,
+      "passed": 7,
+      "failed": 0,
+      "details": [
+        {
+          "case_id": "spec_create",
+          "passed": true,
+          "expected": "file exists on disk",
+          "actual": "exists=True",
+          "duration_ms": 2.24,
+          "detail": ""
+        },
+        {
+          "case_id": "spec_get",
+          "passed": true,
+          "expected": "spec with 2 steps",
+          "actual": "steps=2",
+          "duration_ms": 0.0,
+          "detail": ""
+        },
+        {
+          "case_id": "spec_update",
+          "passed": true,
+          "expected": "goal='Updated goal'",
+          "actual": "goal=Updated goal",
+          "duration_ms": 1.75,
+          "detail": ""
+        },
+        {
+          "case_id": "spec_confirm",
+          "passed": true,
+          "expected": "status=confirmed, all steps confirmed",
+          "actual": "status=confirmed",
+          "duration_ms": 1.86,
+          "detail": ""
+        },
+        {
+          "case_id": "spec_list",
+          "passed": true,
+          "expected": "2 specs",
+          "actual": "2 specs",
+          "duration_ms": 4.92,
+          "detail": ""
+        },
+        {
+          "case_id": "spec_delete",
+          "passed": true,
+          "expected": "deleted, 1 remaining",
+          "actual": "deleted=True, remaining=1",
+          "duration_ms": 1.94,
+          "detail": ""
+        },
+        {
+          "case_id": "spec_get_missing",
+          "passed": true,
+          "expected": "None",
+          "actual": "None",
+          "duration_ms": 0.06,
+          "detail": ""
+        }
+      ]
+    },
+    "verification": {
+      "score": 1.0,
+      "total": 5,
+      "passed": 5,
+      "failed": 0,
+      "details": [
+        {
+          "case_id": "verify_pass",
+          "passed": true,
+          "expected": "passed=True, attempts=1",
+          "actual": "passed=True, attempts=1",
+          "duration_ms": 11.82,
+          "detail": ""
+        },
+        {
+          "case_id": "verify_fail",
+          "passed": true,
+          "expected": "passed=False, has errors",
+          "actual": "passed=False, errors=1",
+          "duration_ms": 9.8,
+          "detail": ""
+        },
+        {
+          "case_id": "verify_retry",
+          "passed": true,
+          "expected": "attempts=3, fix_callback called 2x",
+          "actual": "attempts=3, callbacks=2",
+          "duration_ms": 33.87,
+          "detail": ""
+        },
+        {
+          "case_id": "verify_timeout",
+          "passed": true,
+          "expected": "timeout error",
+          "actual": "passed=False, errors=1",
+          "duration_ms": 506.8,
+          "detail": ""
+        },
+        {
+          "case_id": "verify_multi_command",
+          "passed": true,
+          "expected": "overall fail, output has both commands",
+          "actual": "passed=False",
+          "duration_ms": 23.12,
+          "detail": ""
+        }
+      ]
+    }
+  },
+  "overall_score": 0.9804,
+  "summary": "50/51 tests passed (1 failed) across 7 dimensions."
+}
\ No newline at end of file
diff --git a/test-results/benchmark/benchmark_report.txt b/test-results/benchmark/benchmark_report.txt
new file mode 100644
index 0000000..7b8c1f0
--- /dev/null
+++ b/test-results/benchmark/benchmark_report.txt
@@ -0,0 +1,28 @@
+======================================================================
+AgentKit Benchmark Report
+======================================================================
+Timestamp:      2026-06-17T03:26:25.072956+00:00
+Version:        0.1.0
+Overall Score:  98.0%
+Summary:        50/51 tests passed (1 failed) across 7 dimensions.
+
+----------------------------------------------------------------------
+Dimension             Total   Pass   Fail    Score
+----------------------------------------------------------------------
+preprocessing            15     14      1   93.3%
+overfitting               3      3      0  100.0%
+efficiency                5      5      0  100.0%
+tool_search              10     10      0  100.0%
+event_model               6      6      0  100.0%
+spec_management           7      7      0  100.0%
+verification              5      5      0  100.0%
+----------------------------------------------------------------------
+OVERALL                  51     50      1   98.0%
+======================================================================
+
+Failed Cases:
+----------------------------------------------------------------------
+  [preprocessing] skill_prefix_direct
+    expected: skill_react
+    actual:   direct_chat
+    detail:   input='@skill:chat_only 你好' method=skill_prefix
diff --git a/test-results/e2e/comprehensive_report.json b/test-results/e2e/comprehensive_report.json
new file mode 100644
index 0000000..df1b379
--- /dev/null
+++ b/test-results/e2e/comprehensive_report.json
@@ -0,0 +1,334 @@
+{
+  "report_type": "comprehensive_capability_backtest",
+  "generated_at": "2026-06-17T03:22:42.152439+00:00",
+  "total_score": 100.0,
+  "total_cases": 50,
+  "total_passed": 50,
+  "dimension_scores": {
+    "preprocessing_accuracy": 100.0,
+    "skill_recall": 100.0,
+    "overfitting_detection": 100.0,
+    "execution_efficiency": 100.0,
+    "tool_search_accuracy": 100.0,
+    "event_model_integrity": 100.0,
+    "spec_management": 100.0,
+    "verification_loop": 100.0
+  },
+  "dimension_details": {
+    "preprocessing_accuracy": {
+      "total": 17,
+      "passed": 17,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "greeting_cn",
+          "passed": true,
+          "input": "你好",
+          "expected": "direct_chat",
+          "actual": "direct_chat"
+        },
+        {
+          "case_id": "greeting_en",
+          "passed": true,
+          "input": "hello",
+          "expected": "direct_chat",
+          "actual": "direct_chat"
+        },
+        {
+          "case_id": "greeting_hi",
+          "passed": true,
+          "input": "hi",
+          "expected": "direct_chat",
+          "actual": "direct_chat"
+        },
+        {
+          "case_id": "chitchat_thanks",
+          "passed": true,
+          "input": "谢谢",
+          "expected": "direct_chat",
+          "actual": "direct_chat"
+        },
+        {
+          "case_id": "chitchat_ok",
+          "passed": true,
+          "input": "好的",
+          "expected": "direct_chat",
+          "actual": "direct_chat"
+        },
+        {
+          "case_id": "identity_who",
+          "passed": true,
+          "input": "你是谁",
+          "expected": "direct_chat",
+          "actual": "direct_chat"
+        },
+        {
+          "case_id": "identity_name",
+          "passed": true,
+          "input": "你叫什么",
+          "expected": "direct_chat",
+          "actual": "direct_chat"
+        },
+        {
+          "case_id": "tool_ip",
+          "passed": true,
+          "input": "查下ip",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "tool_search",
+          "passed": true,
+          "input": "搜索golang教程",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "tool_shell",
+          "passed": true,
+          "input": "执行ls命令",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "tool_file",
+          "passed": true,
+          "input": "读一下配置文件",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "tool_monitor",
+          "passed": true,
+          "input": "检查服务状态",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "complex_analysis",
+          "passed": true,
+          "input": "帮我分析一下这个数据并生成报告",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "complex_code",
+          "passed": true,
+          "input": "重构这个函数使其更高效",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "complex_multi",
+          "passed": true,
+          "input": "搜索最新的AI论文并总结关键发现",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "skill_prefix_react",
+          "passed": true,
+          "input": "@skill:react_agent 查看当前ip",
+          "expected": "skill_react",
+          "actual": "skill_react"
+        },
+        {
+          "case_id": "skill_prefix_coder",
+          "passed": true,
+          "input": "@skill:coder 写一个函数",
+          "expected": "skill_react",
+          "actual": "skill_react"
+        }
+      ]
+    },
+    "skill_recall": {
+      "total": 8,
+      "passed": 8,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "recall_valid_react",
+          "passed": true
+        },
+        {
+          "case_id": "recall_valid_coder",
+          "passed": true
+        },
+        {
+          "case_id": "recall_invalid_skill",
+          "passed": true
+        },
+        {
+          "case_id": "recall_no_prefix_react",
+          "passed": true
+        },
+        {
+          "case_id": "recall_no_prefix_greeting",
+          "passed": true
+        },
+        {
+          "case_id": "recall_no_prefix_complex",
+          "passed": true
+        },
+        {
+          "case_id": "recall_skill_only_prefix",
+          "passed": true
+        },
+        {
+          "case_id": "recall_skill_with_long_content",
+          "passed": true
+        }
+      ]
+    },
+    "overfitting_detection": {
+      "total": 5,
+      "passed": 5,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "overfit_ip_check",
+          "passed": true
+        },
+        {
+          "case_id": "overfit_search",
+          "passed": true
+        },
+        {
+          "case_id": "overfit_greeting",
+          "passed": true
+        },
+        {
+          "case_id": "overfit_file_read",
+          "passed": true
+        },
+        {
+          "case_id": "overfit_identity",
+          "passed": true
+        }
+      ]
+    },
+    "execution_efficiency": {
+      "total": 5,
+      "passed": 5,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "efficiency_greeting",
+          "passed": true,
+          "elapsed_ms": 0.41
+        },
+        {
+          "case_id": "efficiency_chitchat",
+          "passed": true,
+          "elapsed_ms": 0.47
+        },
+        {
+          "case_id": "efficiency_identity",
+          "passed": true,
+          "elapsed_ms": 0.48
+        },
+        {
+          "case_id": "efficiency_react_tool",
+          "passed": true,
+          "elapsed_ms": 0.49
+        },
+        {
+          "case_id": "efficiency_react_complex",
+          "passed": true,
+          "elapsed_ms": 0.55
+        }
+      ]
+    },
+    "tool_search_accuracy": {
+      "total": 8,
+      "passed": 8,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "tool_search_read",
+          "passed": true
+        },
+        {
+          "case_id": "tool_search_write",
+          "passed": true
+        },
+        {
+          "case_id": "tool_search_web",
+          "passed": true
+        },
+        {
+          "case_id": "tool_search_shell",
+          "passed": true
+        },
+        {
+          "case_id": "tool_search_tests",
+          "passed": true
+        },
+        {
+          "case_id": "tool_search_file_multiple",
+          "passed": true
+        },
+        {
+          "case_id": "tool_search_no_match",
+          "passed": true
+        },
+        {
+          "case_id": "tool_search_empty_query",
+          "passed": true
+        }
+      ]
+    },
+    "event_model_integrity": {
+      "total": 3,
+      "passed": 3,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "sq_submit_and_drain",
+          "passed": true
+        },
+        {
+          "case_id": "eq_emit_and_subscribe",
+          "passed": true
+        },
+        {
+          "case_id": "event_type_classification",
+          "passed": true
+        }
+      ]
+    },
+    "spec_management": {
+      "total": 2,
+      "passed": 2,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "spec_create_and_get",
+          "passed": true
+        },
+        {
+          "case_id": "spec_confirm",
+          "passed": true
+        }
+      ]
+    },
+    "verification_loop": {
+      "total": 2,
+      "passed": 2,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "verify_success",
+          "passed": true
+        },
+        {
+          "case_id": "verify_failure",
+          "passed": true
+        }
+      ]
+    }
+  },
+  "suggestions": [
+    "所有维度均达到 100%，架构状态良好"
+  ]
+}
\ No newline at end of file
diff --git a/test-results/e2e/comprehensive_report.txt b/test-results/e2e/comprehensive_report.txt
new file mode 100644
index 0000000..8335472
--- /dev/null
+++ b/test-results/e2e/comprehensive_report.txt
@@ -0,0 +1,95 @@
+======================================================================
+Fischer AgentKit 综合能力回测报告
+======================================================================
+生成时间: 2026-06-17T03:22:42.152439+00:00
+总体评分: 100.0%
+用例总数: 50  通过: 50  失败: 0
+
+----------------------------------------------------------------------
+各维度得分
+----------------------------------------------------------------------
+  ✓ 预处理准确度: 100.0% (17/17)
+  ✓ 技能召回率: 100.0% (8/8)
+  ✓ 过拟合检测: 100.0% (5/5)
+  ✓ 执行效率: 100.0% (5/5)
+  ✓ 工具搜索准确度: 100.0% (8/8)
+  ✓ 事件模型完整性: 100.0% (3/3)
+  ✓ Spec 管理功能: 100.0% (2/2)
+  ✓ 验证循环: 100.0% (2/2)
+
+----------------------------------------------------------------------
+详细用例结果
+----------------------------------------------------------------------
+
+[预处理准确度]
+  ✓ greeting_cn
+  ✓ greeting_en
+  ✓ greeting_hi
+  ✓ chitchat_thanks
+  ✓ chitchat_ok
+  ✓ identity_who
+  ✓ identity_name
+  ✓ tool_ip
+  ✓ tool_search
+  ✓ tool_shell
+  ✓ tool_file
+  ✓ tool_monitor
+  ✓ complex_analysis
+  ✓ complex_code
+  ✓ complex_multi
+  ✓ skill_prefix_react
+  ✓ skill_prefix_coder
+
+[技能召回率]
+  ✓ recall_valid_react
+  ✓ recall_valid_coder
+  ✓ recall_invalid_skill
+  ✓ recall_no_prefix_react
+  ✓ recall_no_prefix_greeting
+  ✓ recall_no_prefix_complex
+  ✓ recall_skill_only_prefix
+  ✓ recall_skill_with_long_content
+
+[过拟合检测]
+  ✓ overfit_ip_check
+  ✓ overfit_search
+  ✓ overfit_greeting
+  ✓ overfit_file_read
+  ✓ overfit_identity
+
+[执行效率]
+  ✓ efficiency_greeting
+  ✓ efficiency_chitchat
+  ✓ efficiency_identity
+  ✓ efficiency_react_tool
+  ✓ efficiency_react_complex
+
+[工具搜索准确度]
+  ✓ tool_search_read
+  ✓ tool_search_write
+  ✓ tool_search_web
+  ✓ tool_search_shell
+  ✓ tool_search_tests
+  ✓ tool_search_file_multiple
+  ✓ tool_search_no_match
+  ✓ tool_search_empty_query
+
+[事件模型完整性]
+  ✓ sq_submit_and_drain
+  ✓ eq_emit_and_subscribe
+  ✓ event_type_classification
+
+[Spec 管理功能]
+  ✓ spec_create_and_get
+  ✓ spec_confirm
+
+[验证循环]
+  ✓ verify_success
+  ✓ verify_failure
+
+----------------------------------------------------------------------
+改进建议
+----------------------------------------------------------------------
+  • 所有维度均达到 100%，架构状态良好
+
+======================================================================
\ No newline at end of file
diff --git a/tests/e2e/test_capability_comprehensive.py b/tests/e2e/test_capability_comprehensive.py
new file mode 100644
index 0000000..672fb58
--- /dev/null
+++ b/tests/e2e/test_capability_comprehensive.py
@@ -0,0 +1,1519 @@
+"""Comprehensive Capability Backtest — 综合能力回测脚本
+
+覆盖维度：
+1. 预处理准确度 (Preprocessing Accuracy) — greeting/tool/skill/complex 路由
+2. 召回率 (Recall) — @skill 前缀识别与 fallback
+3. 过拟合检测 (Overfitting Detection) — 同意图不同表达的一致性
+4. 执行效率 (Execution Efficiency) — DIRECT_CHAT/REACT 路径耗时
+5. 工具搜索准确度 (Tool Search Accuracy) — BM25 相关性排序
+6. 事件模型完整性 (Event Model Integrity) — SQ/EQ 双队列
+7. Spec 管理功能 (Spec Management) — CRUD + 确认流程
+8. 验证循环 (Verification Loop) — verify + retry
+
+设计原则：
+- 不依赖真实 LLM 调用（使用 Mock 或直接测试组件接口）
+- 可独立运行（不依赖 E2E 服务器、Redis、PostgreSQL）
+- 标记为 @pytest.mark.e2e_capability
+- 最后生成综合能力报告（JSON + 中文文本）
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+from agentkit.chat.request_preprocessor import RequestPreprocessor
+from agentkit.core.event_queue import EventQueue, Submission, SubmissionQueue
+from agentkit.core.protocol import (
+    Event,
+    SessionEventType,
+    TaskEventType,
+    TurnEventType,
+)
+from agentkit.core.spec_manager import Spec, SpecManager, SpecStep
+from agentkit.core.verification_loop import VerificationLoop
+from agentkit.skills.base import Skill, SkillConfig
+from agentkit.skills.registry import SkillRegistry
+from agentkit.tools.base import Tool
+from agentkit.tools.search import ToolSearchIndex
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 结果收集器（模块级，跨测试类共享）
+# ═══════════════════════════════════════════════════════════════════════════
+
+
+class _ResultCollector:
+    """Accumulate per-case outcomes, grouped by dimension, for the final report."""
+
+    def __init__(self) -> None:
+        self.results: dict[str, list[dict[str, Any]]] = {}
+
+    def record(
+        self,
+        dimension: str,
+        case_id: str,
+        passed: bool,
+        **extra: Any,
+    ) -> None:
+        """Append one case outcome (plus any extra fields) under ``dimension``."""
+        # ``extra`` cannot shadow case_id/passed: those names bind to parameters.
+        entry: dict[str, Any] = {"case_id": case_id, "passed": passed, **extra}
+        self.results.setdefault(dimension, []).append(entry)
+
+    @staticmethod
+    def _pass_rate(cases: list[dict[str, Any]]) -> float:
+        """Return the percentage of cases whose ``passed`` is truthy (0.0 if none)."""
+        if not cases:
+            return 0.0
+        return sum(1 for case in cases if case["passed"]) / len(cases) * 100
+
+    def dimension_score(self, dimension: str) -> float:
+        """Return the pass percentage of one dimension (0.0 if nothing recorded)."""
+        return self._pass_rate(self.results.get(dimension, []))
+
+    def total_score(self) -> float:
+        """Return the pass percentage over all recorded cases pooled together.
+
+        Every case weighs the same, so this is NOT the mean of the
+        per-dimension scores: dimensions with more cases contribute more.
+        """
+        pooled = [case for cases in self.results.values() for case in cases]
+        return self._pass_rate(pooled)
+
+    def clear(self) -> None:
+        """Drop every recorded result so collection can start over."""
+        self.results.clear()
+
+
+_COLLECTOR = _ResultCollector()
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 测试辅助工具
+# ═══════════════════════════════════════════════════════════════════════════
+
+
+class _FakeTool(Tool):
+    """Stub ``Tool`` for tests: carries metadata only and never does real work."""
+
+    def __init__(
+        self,
+        name: str,
+        description: str,
+        input_schema: dict[str, Any] | None = None,
+        tags: list[str] | None = None,
+    ) -> None:
+        # A missing/empty tag list is always forwarded as a fresh empty list.
+        tag_list = tags if tags else []
+        super().__init__(
+            name=name, description=description, input_schema=input_schema, tags=tag_list
+        )
+
+    async def execute(self, **kwargs: Any) -> dict[str, Any]:
+        """Ignore all arguments and report a fixed success payload."""
+        return {"status": "ok"}
+
+
+def _build_test_tools() -> list[Tool]:
+    """Build five fake tools covering io/file, web, shell and testing scenarios."""
+    file_reader = _FakeTool(
+        name="read_file",
+        description="Read the contents of a file from the filesystem.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "path": {"type": "string", "description": "file path to read"},
+            },
+            "required": ["path"],
+        },
+        tags=["io", "file"],
+    )
+    file_writer = _FakeTool(
+        name="write_file",
+        description="Write content to a file on the filesystem.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "path": {"type": "string", "description": "file path to write"},
+                "content": {"type": "string", "description": "content to write"},
+            },
+            "required": ["path", "content"],
+        },
+        tags=["io", "file"],
+    )
+    web_searcher = _FakeTool(
+        name="web_search",
+        description="Search the web for information using a search engine.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "query": {"type": "string", "description": "search query"},
+            },
+            "required": ["query"],
+        },
+        tags=["web", "search"],
+    )
+    shell_runner = _FakeTool(
+        name="shell_exec",
+        description="Execute a shell command and return the output.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "command": {
+                    "type": "string",
+                    "description": "shell command to execute",
+                },
+            },
+            "required": ["command"],
+        },
+        tags=["shell", "system"],
+    )
+    test_runner = _FakeTool(
+        name="run_tests",
+        description="Run project tests to verify code changes.",
+        input_schema={
+            "type": "object",
+            "properties": {
+                "commands": {
+                    "type": "array",
+                    "description": "test commands to run",
+                },
+            },
+        },
+        tags=["testing", "verification"],
+    )
+    # Keep this order: other helpers pick tools out of the list by position.
+    return [file_reader, file_writer, web_searcher, shell_runner, test_runner]
+
+
+def _build_mock_skill_registry() -> SkillRegistry:
+    """Return a SkillRegistry holding two skills backed by fake tools (no real LLM)."""
+    registry = SkillRegistry()
+    read_file, write_file, web_search, *_others = _build_test_tools()
+
+    # react_agent skill: equipped with the web_search tool only.
+    react_cfg = SkillConfig(
+        name="react_agent",
+        agent_type="react_agent",
+        version="1.0.0",
+        description="ReAct agent skill for tool-augmented reasoning",
+        execution_mode="react",
+        prompt={
+            "identity": "You are a ReAct agent.",
+            "instructions": "Use tools to answer questions step by step.",
+        },
+    )
+    react_skill = Skill(react_cfg, tools=[web_search])
+    registry.register(react_skill)
+
+    # coder skill: equipped with the read_file and write_file tools.
+    coder_cfg = SkillConfig(
+        name="coder",
+        agent_type="coder",
+        version="1.0.0",
+        description="Code generation and review skill",
+        execution_mode="react",
+        prompt={
+            "identity": "You are a coding assistant.",
+            "instructions": "Help with code generation, review, and refactoring.",
+        },
+    )
+    coder_skill = Skill(coder_cfg, tools=[read_file, write_file])
+    registry.register(coder_skill)
+
+    return registry
+
+
+def _build_preprocessor() -> RequestPreprocessor:
+    """Create a RequestPreprocessor wired to the mock registry and fake tools."""
+    return RequestPreprocessor(
+        default_agent_name="default",
+        default_model="test-model",
+        default_system_prompt="You are a helpful assistant.",
+        skill_registry=_build_mock_skill_registry(),
+        default_tools=_build_test_tools(),
+    )
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 测试数据定义
+# ═══════════════════════════════════════════════════════════════════════════
+
+# 1. Preprocessing-accuracy cases (at least 15): input text -> expected execution mode.
+PREPROCESSING_CASES: list[dict[str, Any]] = [
+    # Greeting/Chitchat → DIRECT_CHAT
+    {"id": "greeting_cn", "input": "你好", "expected_mode": "direct_chat"},
+    {"id": "greeting_en", "input": "hello", "expected_mode": "direct_chat"},
+    {"id": "greeting_hi", "input": "hi", "expected_mode": "direct_chat"},
+    {"id": "chitchat_thanks", "input": "谢谢", "expected_mode": "direct_chat"},
+    {"id": "chitchat_ok", "input": "好的", "expected_mode": "direct_chat"},
+    {"id": "identity_who", "input": "你是谁", "expected_mode": "direct_chat"},
+    {"id": "identity_name", "input": "你叫什么", "expected_mode": "direct_chat"},
+    # Tool-requiring queries → REACT
+    {"id": "tool_ip", "input": "查下ip", "expected_mode": "react"},
+    {"id": "tool_search", "input": "搜索golang教程", "expected_mode": "react"},
+    {"id": "tool_shell", "input": "执行ls命令", "expected_mode": "react"},
+    {"id": "tool_file", "input": "读一下配置文件", "expected_mode": "react"},
+    {"id": "tool_monitor", "input": "检查服务状态", "expected_mode": "react"},
+    # Complex queries → REACT
+    {"id": "complex_analysis", "input": "帮我分析一下这个数据并生成报告", "expected_mode": "react"},
+    {"id": "complex_code", "input": "重构这个函数使其更高效", "expected_mode": "react"},
+    {"id": "complex_multi", "input": "搜索最新的AI论文并总结关键发现", "expected_mode": "react"},
+    # @skill prefix → SKILL_REACT
+    {
+        "id": "skill_prefix_react",
+        "input": "@skill:react_agent 查看当前ip",
+        "expected_mode": "skill_react",
+    },
+    {
+        "id": "skill_prefix_coder",
+        "input": "@skill:coder 写一个函数",
+        "expected_mode": "skill_react",
+    },
+]
+
+# 2. Skill-recall cases (at least 8): expected match flag, skill name and mode per input.
+RECALL_CASES: list[dict[str, Any]] = [
+    {
+        "id": "recall_valid_react",
+        "input": "@skill:react_agent 查看ip",
+        "expected_matched": True,
+        "expected_skill": "react_agent",
+        "expected_mode": "skill_react",
+    },
+    {
+        "id": "recall_valid_coder",
+        "input": "@skill:coder 写代码",
+        "expected_matched": True,
+        "expected_skill": "coder",
+        "expected_mode": "skill_react",
+    },
+    {
+        "id": "recall_invalid_skill",
+        "input": "@skill:nonexistent 做点什么",
+        "expected_matched": False,
+        "expected_skill": None,
+        "expected_mode": "react",
+    },
+    {
+        "id": "recall_no_prefix_react",
+        "input": "查下ip地址",
+        "expected_matched": False,
+        "expected_skill": None,
+        "expected_mode": "react",
+    },
+    {
+        "id": "recall_no_prefix_greeting",
+        "input": "你好",
+        "expected_matched": False,
+        "expected_skill": None,
+        "expected_mode": "direct_chat",
+    },
+    {
+        "id": "recall_no_prefix_complex",
+        "input": "分析数据并生成报告",
+        "expected_matched": False,
+        "expected_skill": None,
+        "expected_mode": "react",
+    },
+    {
+        "id": "recall_skill_only_prefix",
+        "input": "@skill:react_agent",
+        "expected_matched": True,
+        "expected_skill": "react_agent",
+        "expected_mode": "skill_react",
+    },
+    {
+        "id": "recall_skill_with_long_content",
+        "input": "@skill:coder 请帮我重构这个函数，使其时间复杂度从 O(n²) 降到 O(n)",
+        "expected_matched": True,
+        "expected_skill": "coder",
+        "expected_mode": "skill_react",
+    },
+]
+
+# 3. Overfitting cases (at least 5 groups): one original phrasing plus 3 paraphrases each.
+OVERFITTING_CASES: list[dict[str, Any]] = [
+    {
+        "id": "overfit_ip_check",
+        "original": "查看当前ip",
+        "paraphrases": ["查下ip", "获取ip地址", "看下ip"],
+        "expected_mode": "react",
+    },
+    {
+        "id": "overfit_search",
+        "original": "搜索golang教程",
+        "paraphrases": ["搜一下golang教程", "找下golang学习资料", "帮我搜golang入门"],
+        "expected_mode": "react",
+    },
+    {
+        "id": "overfit_greeting",
+        "original": "你好",
+        "paraphrases": ["hello", "hi", "嗨"],
+        "expected_mode": "direct_chat",
+    },
+    {
+        "id": "overfit_file_read",
+        "original": "读一下配置文件",
+        "paraphrases": ["看一下配置文件", "帮我读配置", "查看配置文件内容"],
+        "expected_mode": "react",
+    },
+    {
+        "id": "overfit_identity",
+        "original": "你是谁",
+        "paraphrases": ["你叫什么", "自我介绍", "你是什么"],
+        "expected_mode": "direct_chat",
+    },
+]
+
+# 4. Execution-efficiency cases (at least 5): expected mode plus a latency budget in ms.
+EFFICIENCY_CASES: list[dict[str, Any]] = [
+    {
+        "id": "efficiency_greeting",
+        "input": "你好",
+        "expected_mode": "direct_chat",
+        "max_time_ms": 2000,
+    },
+    {
+        "id": "efficiency_chitchat",
+        "input": "谢谢",
+        "expected_mode": "direct_chat",
+        "max_time_ms": 2000,
+    },
+    {
+        "id": "efficiency_identity",
+        "input": "你是谁",
+        "expected_mode": "direct_chat",
+        "max_time_ms": 2000,
+    },
+    {
+        "id": "efficiency_react_tool",
+        "input": "查下ip",
+        "expected_mode": "react",
+        "max_time_ms": 5000,
+    },
+    {
+        "id": "efficiency_react_complex",
+        "input": "帮我分析一下这个数据并生成报告",
+        "expected_mode": "react",
+        "max_time_ms": 5000,
+    },
+]
+
+# 5. Tool-search cases (at least 8): query plus expected top hit, members, or emptiness.
+TOOL_SEARCH_CASES: list[dict[str, Any]] = [
+    {
+        "id": "tool_search_read",
+        "query": "read file",
+        "expected_top1": "read_file",
+    },
+    {
+        "id": "tool_search_write",
+        "query": "write file",
+        "expected_top1": "write_file",
+    },
+    {
+        "id": "tool_search_web",
+        "query": "web search",
+        "expected_top1": "web_search",
+    },
+    {
+        "id": "tool_search_shell",
+        "query": "shell command execute",
+        "expected_top1": "shell_exec",
+    },
+    {
+        "id": "tool_search_tests",
+        "query": "run tests verify",
+        "expected_top1": "run_tests",
+    },
+    {
+        "id": "tool_search_file_multiple",
+        "query": "file",
+        "expected_contains": ["read_file", "write_file"],
+    },
+    {
+        "id": "tool_search_no_match",
+        "query": "xyzzy_nonexistent_xyz",
+        "expected_empty": True,
+    },
+    {
+        "id": "tool_search_empty_query",
+        "query": "",
+        "expected_empty": True,
+    },
+]
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 1. 预处理准确度
+# ═══════════════════════════════════════════════════════════════════════════
+
+
+@pytest.mark.e2e_capability
+class TestPreprocessingAccuracy:
+    """Routing-accuracy checks for ``RequestPreprocessor.preprocess``.
+
+    Scenarios, as listed in ``PREPROCESSING_CASES``:
+    - greeting / chitchat / identity inputs -> ``direct_chat``
+    - tool-requiring queries -> ``react``
+    - ``@skill:`` prefixed inputs -> ``skill_react``
+    - complex multi-step requests -> ``react``
+    """
+
+    @pytest.mark.parametrize(
+        "case",
+        PREPROCESSING_CASES,
+        ids=[c["id"] for c in PREPROCESSING_CASES],
+    )
+    def test_preprocessing_routing(self, case: dict[str, Any]) -> None:
+        """Each input must be routed to its expected execution mode."""
+        outcome = asyncio.run(_build_preprocessor().preprocess(content=case["input"]))
+
+        got = outcome.execution_mode.value
+        wanted = case["expected_mode"]
+        routed_ok = got == wanted
+
+        # Record before asserting so a failing case still reaches the collector.
+        _COLLECTOR.record(
+            "preprocessing_accuracy",
+            case["id"],
+            routed_ok,
+            input=case["input"],
+            expected=wanted,
+            actual=got,
+            match_method=outcome.match_method,
+        )
+
+        assert got == wanted, (
+            f"'{case['input']}': expected {wanted}, got {got} "
+            f"(method={outcome.match_method})"
+        )
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 2. 召回率 — 技能匹配
+# ═══════════════════════════════════════════════════════════════════════════
+
+
+@pytest.mark.e2e_capability
+class TestSkillRecall:
+    """Skill-recall checks: ``@skill`` prefix recognition and fallback.
+
+    Scenarios, as listed in ``RECALL_CASES``:
+    - registered ``@skill:<name>`` prefix -> that skill, ``skill_react``
+    - unregistered ``@skill:<name>`` prefix -> no match, ``react``
+    - no prefix -> no match, ``react`` or ``direct_chat``
+    """
+
+    @pytest.mark.parametrize(
+        "case",
+        RECALL_CASES,
+        ids=[c["id"] for c in RECALL_CASES],
+    )
+    def test_skill_recall(self, case: dict[str, Any]) -> None:
+        """Matched flag, skill name and execution mode must all be as expected."""
+        outcome = asyncio.run(_build_preprocessor().preprocess(content=case["input"]))
+        text = case["input"]
+
+        got_matched = outcome.matched
+        got_skill = outcome.skill_name
+        got_mode = outcome.execution_mode.value
+        want_matched = case["expected_matched"]
+        want_skill = case["expected_skill"]
+        want_mode = case["expected_mode"]
+
+        checks = [
+            (got_matched, want_matched),
+            (got_skill, want_skill),
+            (got_mode, want_mode),
+        ]
+
+        _COLLECTOR.record(
+            "skill_recall",
+            case["id"],
+            all(got == want for got, want in checks),
+            input=text,
+            expected_matched=want_matched,
+            actual_matched=got_matched,
+            expected_skill=want_skill,
+            actual_skill=got_skill,
+            expected_mode=want_mode,
+            actual_mode=got_mode,
+        )
+
+        assert got_matched == want_matched, (
+            f"'{text}': matched expected {want_matched}, got {got_matched}"
+        )
+        assert got_skill == want_skill, (
+            f"'{text}': skill expected {want_skill}, got {got_skill}"
+        )
+        assert got_mode == want_mode, (
+            f"'{text}': mode expected {want_mode}, got {got_mode}"
+        )
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 3. 过拟合检测
+# ═══════════════════════════════════════════════════════════════════════════
+
+
+@pytest.mark.e2e_capability
+class TestOverfittingDetection:
+    """Overfitting checks: paraphrases of one intent must route identically.
+
+    If "查看当前ip" and "查下ip" were routed to different execution modes, the
+    router would be keyed to specific wording rather than to the intent.
+    """
+
+    @pytest.mark.parametrize(
+        "case",
+        OVERFITTING_CASES,
+        ids=[c["id"] for c in OVERFITTING_CASES],
+    )
+    def test_paraphrase_consistency(self, case: dict[str, Any]) -> None:
+        """The original and every paraphrase must all yield ``expected_mode``."""
+        preprocessor = _build_preprocessor()
+        expected_mode = case["expected_mode"]
+
+        def route(text: str) -> str:
+            """Run one input through the shared preprocessor; return its mode."""
+            outcome = asyncio.run(preprocessor.preprocess(content=text))
+            return outcome.execution_mode.value
+
+        # The original phrasing is routed first, then each paraphrase in order.
+        original_mode = route(case["original"])
+        paraphrases = case["paraphrases"]
+        paraphrase_modes = [route(para) for para in paraphrases]
+
+        observed = [original_mode, *paraphrase_modes]
+        all_consistent = all(mode == expected_mode for mode in observed)
+
+        _COLLECTOR.record(
+            "overfitting_detection",
+            case["id"],
+            all_consistent,
+            original=case["original"],
+            original_mode=original_mode,
+            paraphrases=paraphrases,
+            paraphrase_modes=paraphrase_modes,
+            expected_mode=expected_mode,
+        )
+
+        assert all_consistent, (
+            f"Overfitting detected for '{case['id']}': "
+            f"original='{case['original']}' → {original_mode}, "
+            f"paraphrases={paraphrases} → {paraphrase_modes}, "
+            f"expected={expected_mode}"
+        )
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 4. 执行效率
+# ═══════════════════════════════════════════════════════════════════════════
+
+
+@pytest.mark.e2e_capability
+class TestExecutionEfficiency:
+    """Execution efficiency: wall-clock time of the preprocessing stage.
+
+    Every case carries its own ``max_time_ms`` budget and the execution mode
+    it is expected to resolve to; both conditions must hold for the case to
+    pass.
+    """
+
+    @pytest.mark.parametrize(
+        "case",
+        EFFICIENCY_CASES,
+        ids=[entry["id"] for entry in EFFICIENCY_CASES],
+    )
+    def test_preprocessing_latency(self, case: dict[str, Any]) -> None:
+        """Check that preprocessing finishes within the case's time budget."""
+        preprocessor = _build_preprocessor()
+
+        # Only the preprocess call (including asyncio.run) sits inside the timed region.
+        started_at = time.perf_counter()
+        result = asyncio.run(preprocessor.preprocess(content=case["input"]))
+        elapsed_ms = (time.perf_counter() - started_at) * 1000
+
+        actual_mode = result.execution_mode.value
+        max_time_ms = case["max_time_ms"]
+        within_budget = elapsed_ms < max_time_ms
+        mode_matches = actual_mode == case["expected_mode"]
+
+        _COLLECTOR.record(
+            dimension="execution_efficiency",
+            case_id=case["id"],
+            passed=within_budget and mode_matches,
+            input=case["input"],
+            elapsed_ms=round(elapsed_ms, 2),
+            max_time_ms=max_time_ms,
+            actual_mode=actual_mode,
+            expected_mode=case["expected_mode"],
+        )
+
+        assert within_budget, (
+            f"'{case['input']}': elapsed {elapsed_ms:.1f}ms > limit {max_time_ms}ms"
+        )
+        assert mode_matches, (
+            f"'{case['input']}': mode expected {case['expected_mode']}, got {actual_mode}"
+        )
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 5. 工具搜索准确度
+# ═══════════════════════════════════════════════════════════════════════════
+
+
+@pytest.mark.e2e_capability
+class TestToolSearchAccuracy:
+    """Tool search accuracy: relevance of ``ToolSearchIndex.search`` results.
+
+    A case declares exactly one kind of expectation:
+    - ``expected_empty``: the search must return nothing
+    - ``expected_top1``: the first result must carry this tool name
+    - ``expected_contains``: every listed tool name must be among the results
+    """
+
+    @pytest.mark.parametrize(
+        "case",
+        TOOL_SEARCH_CASES,
+        ids=[c["id"] for c in TOOL_SEARCH_CASES],
+    )
+    def test_tool_search(self, case: dict[str, Any]) -> None:
+        """Check that the search returns what the case expects.
+
+        The verdict handed to the collector and the assertion are derived from
+        one ``failure`` message, so they cannot disagree. A case that declares
+        no recognised expectation fails instead of passing silently.
+        """
+        index = ToolSearchIndex(_build_test_tools())
+        results = index.search(case["query"], top_k=5)
+        result_names = [r.name for r in results]
+
+        detail: dict[str, Any] = {"query": case["query"], "results": result_names}
+        failure = ""  # stays empty while the case is satisfied
+
+        if case.get("expected_empty"):
+            detail["expected_empty"] = True
+            if len(results) != 0:
+                failure = f"Query '{case['query']}': expected empty, got {result_names}"
+        elif "expected_top1" in case:
+            detail["expected_top1"] = case["expected_top1"]
+            if len(results) == 0:
+                failure = f"Query '{case['query']}': no results"
+            elif results[0].name != case["expected_top1"]:
+                failure = (
+                    f"Query '{case['query']}': expected top1={case['expected_top1']}, "
+                    f"got {results[0].name}"
+                )
+        elif "expected_contains" in case:
+            expected = case["expected_contains"]
+            detail["expected_contains"] = expected
+            missing = [name for name in expected if name not in result_names]
+            if missing:
+                # Report the first missing name, as a per-name assertion loop would.
+                failure = (
+                    f"Query '{case['query']}': expected '{missing[0]}' in results, "
+                    f"got {result_names}"
+                )
+        else:
+            # A malformed case used to be recorded as failed while the test passed.
+            failure = (
+                f"Case '{case['id']}': no expectation declared "
+                "(expected_empty / expected_top1 / expected_contains)"
+            )
+
+        _COLLECTOR.record(
+            dimension="tool_search_accuracy",
+            case_id=case["id"],
+            passed=not failure,
+            **detail,
+        )
+
+        assert not failure, failure
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 6. 事件模型完整性
+# ═══════════════════════════════════════════════════════════════════════════
+
+
+@pytest.mark.e2e_capability
+class TestEventModelIntegrity:
+    """Event model integrity: basic behaviour of SubmissionQueue (SQ) and EventQueue (EQ).
+
+    Scenarios:
+    - SQ submit and drain
+    - SQ task cancellation
+    - EQ emit and subscribe
+    - EQ broadcast to several subscribers
+    - EQ buffer replay
+    - event type classification by prefix
+    """
+
+    async def test_sq_submit_and_drain(self) -> None:
+        """A submitted item comes out of drain() with its content and task id."""
+        queue = SubmissionQueue()
+        task_id = await queue.submit("hello", "session-1")
+
+        received: list[Submission] = []
+
+        async def take_first() -> None:
+            async for submission in queue.drain():
+                received.append(submission)
+                break  # one item is all this scenario needs
+
+        await asyncio.wait_for(asyncio.create_task(take_first()), timeout=1.0)
+
+        passed = len(received) == 1 and received[0].content == "hello"
+        _COLLECTOR.record(
+            dimension="event_model_integrity",
+            case_id="sq_submit_and_drain",
+            passed=passed,
+        )
+        assert passed
+        assert received[0].task_id == task_id
+
+    async def test_sq_cancel_task(self) -> None:
+        """After the first submission is cancelled, drain() yields the second one."""
+        queue = SubmissionQueue()
+        cancelled_id = await queue.submit("first", "session-1")
+        await queue.submit("second", "session-1")
+        await queue.cancel(cancelled_id)
+
+        contents: list[str] = []
+
+        async def take_first() -> None:
+            async for submission in queue.drain():
+                contents.append(submission.content)
+                break  # stop at the first item that is actually delivered
+
+        await asyncio.wait_for(asyncio.create_task(take_first()), timeout=1.0)
+
+        passed = contents == ["second"]
+        _COLLECTOR.record(
+            dimension="event_model_integrity",
+            case_id="sq_cancel_task",
+            passed=passed,
+        )
+        assert passed
+
+    async def test_eq_emit_and_subscribe(self) -> None:
+        """An emitted event reaches a subscriber with its type and data intact."""
+        events = EventQueue()
+        token_event = Event.create(
+            event_type=TurnEventType.TOKEN,
+            task_id="task-1",
+            session_id="session-1",
+            data={"text": "hello"},
+        )
+
+        delivered: list[Event] = []
+
+        async def take_first() -> None:
+            async for evt in events.subscribe():
+                delivered.append(evt)
+                break
+
+        listener = asyncio.create_task(take_first())
+        # Let the listener task run before the event is emitted.
+        await asyncio.sleep(0.05)
+        await events.emit(token_event)
+        await asyncio.wait_for(listener, timeout=1.0)
+
+        passed = (
+            len(delivered) == 1
+            and delivered[0].event_type == TurnEventType.TOKEN
+            and delivered[0].data == {"text": "hello"}
+        )
+        _COLLECTOR.record(
+            dimension="event_model_integrity",
+            case_id="eq_emit_and_subscribe",
+            passed=passed,
+        )
+        assert passed
+
+    async def test_eq_broadcast_to_multiple_subscribers(self) -> None:
+        """Two concurrent subscribers each receive both emitted events."""
+        events = EventQueue()
+        inbox_a: list[Event] = []
+        inbox_b: list[Event] = []
+
+        async def take_two(inbox: list[Event]) -> None:
+            # Each call opens its own subscription and stops after two events.
+            async for evt in events.subscribe():
+                inbox.append(evt)
+                if len(inbox) >= 2:
+                    break
+
+        listener_a = asyncio.create_task(take_two(inbox_a))
+        listener_b = asyncio.create_task(take_two(inbox_b))
+        # Let both listener tasks run before anything is emitted.
+        await asyncio.sleep(0.05)
+
+        await events.emit(Event.create(TurnEventType.TOKEN, "t1", "s1", {"seq": 1}))
+        await events.emit(Event.create(TurnEventType.TOKEN, "t1", "s1", {"seq": 2}))
+
+        await asyncio.wait_for(listener_a, timeout=1.0)
+        await asyncio.wait_for(listener_b, timeout=1.0)
+
+        passed = len(inbox_a) == 2 and len(inbox_b) == 2
+        _COLLECTOR.record(
+            dimension="event_model_integrity",
+            case_id="eq_broadcast",
+            passed=passed,
+        )
+        assert passed
+
+    async def test_eq_buffer_replay(self) -> None:
+        """Events emitted before subscribing are delivered, in order, to a late subscriber."""
+        events = EventQueue(buffer_size=100)
+
+        # Both events are emitted while nobody is subscribed yet.
+        await events.emit(Event.create(TurnEventType.TOKEN, "t1", "s1", {"seq": 1}))
+        await events.emit(Event.create(TurnEventType.TOKEN, "t1", "s1", {"seq": 2}))
+
+        replayed: list[Event] = []
+
+        async def take_two() -> None:
+            async for evt in events.subscribe():
+                replayed.append(evt)
+                if len(replayed) >= 2:
+                    break
+
+        await asyncio.wait_for(asyncio.create_task(take_two()), timeout=1.0)
+
+        passed = (
+            len(replayed) == 2
+            and replayed[0].data == {"seq": 1}
+            and replayed[1].data == {"seq": 2}
+        )
+        _COLLECTOR.record(
+            dimension="event_model_integrity",
+            case_id="eq_buffer_replay",
+            passed=passed,
+        )
+        assert passed
+
+    def test_event_type_classification(self) -> None:
+        """Event type members start with their family prefix (session. / task. / turn.)."""
+        members_by_prefix = (
+            (
+                "session.",
+                (
+                    SessionEventType.SESSION_STARTED,
+                    SessionEventType.SESSION_ENDED,
+                ),
+            ),
+            (
+                "task.",
+                (
+                    TaskEventType.TASK_CREATED,
+                    TaskEventType.TASK_STARTED,
+                    TaskEventType.TASK_COMPLETED,
+                    TaskEventType.TASK_FAILED,
+                ),
+            ),
+            (
+                "turn.",
+                (
+                    TurnEventType.TURN_STARTED,
+                    TurnEventType.THINKING,
+                    TurnEventType.TOOL_CALL,
+                    TurnEventType.TOKEN,
+                    TurnEventType.FINAL_ANSWER,
+                ),
+            ),
+        )
+
+        all_correct = all(
+            member.startswith(prefix)
+            for prefix, members in members_by_prefix
+            for member in members
+        )
+
+        _COLLECTOR.record(
+            dimension="event_model_integrity",
+            case_id="event_type_classification",
+            passed=all_correct,
+        )
+        assert all_correct
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 7. Spec 管理功能
+# ═══════════════════════════════════════════════════════════════════════════
+
+
+@pytest.mark.e2e_capability
+class TestSpecManagement:
+    """Spec management: create / read / update / confirm / list / delete via SpecManager.
+
+    Scenarios:
+    - create a Spec and read it back
+    - update a Spec field
+    - confirm a Spec (status, timestamp, step status)
+    - list Specs and filter by status
+    - delete a Spec
+    """
+
+    @staticmethod
+    def _manager(tmp_path: Path) -> SpecManager:
+        """Return a SpecManager whose specs directory is ``<tmp_path>/specs``."""
+        return SpecManager(specs_dir=str(tmp_path / "specs"))
+
+    def test_spec_create_and_get(self, tmp_path: Path) -> None:
+        """Create a Spec, read it back and check that its fields survived."""
+        manager = self._manager(tmp_path)
+        path = manager.create(
+            Spec(
+                spec_id="test-spec-1",
+                goal="Test goal",
+                steps=[
+                    SpecStep(step_id="s1", name="Step 1", description="First"),
+                    SpecStep(
+                        step_id="s2",
+                        name="Step 2",
+                        description="Second",
+                        dependencies=["s1"],
+                    ),
+                ],
+            )
+        )
+        loaded = manager.get("test-spec-1")
+
+        # Order matters: the length check guards the index into steps.
+        passed = (
+            path.exists()
+            and loaded is not None
+            and loaded.spec_id == "test-spec-1"
+            and loaded.goal == "Test goal"
+            and len(loaded.steps) == 2
+            and loaded.steps[1].dependencies == ["s1"]
+        )
+        _COLLECTOR.record(
+            dimension="spec_management",
+            case_id="spec_create_and_get",
+            passed=passed,
+        )
+        assert passed
+
+    def test_spec_update(self, tmp_path: Path) -> None:
+        """An updated goal is visible on the returned Spec and on a later get()."""
+        manager = self._manager(tmp_path)
+        manager.create(Spec(spec_id="test-spec-2", goal="Original"))
+
+        updated = manager.update("test-spec-2", goal="Updated goal")
+        reloaded = manager.get("test-spec-2")
+
+        passed = (
+            updated is not None
+            and updated.goal == "Updated goal"
+            and reloaded is not None
+            and reloaded.goal == "Updated goal"
+        )
+        _COLLECTOR.record(
+            dimension="spec_management",
+            case_id="spec_update",
+            passed=passed,
+        )
+        assert passed
+
+    def test_spec_confirm(self, tmp_path: Path) -> None:
+        """Confirming a Spec sets its status, its timestamp and every step's status."""
+        manager = self._manager(tmp_path)
+        manager.create(
+            Spec(
+                spec_id="test-spec-3",
+                goal="Confirm test",
+                steps=[
+                    SpecStep(step_id="s1", name="Step 1", description="First"),
+                ],
+            )
+        )
+
+        confirmed = manager.confirm("test-spec-3")
+
+        passed = (
+            confirmed is not None
+            and confirmed.status == "confirmed"
+            and confirmed.confirmed_at is not None
+            and all(step.status == "confirmed" for step in confirmed.steps)
+        )
+        _COLLECTOR.record(
+            dimension="spec_management",
+            case_id="spec_confirm",
+            passed=passed,
+        )
+        assert passed
+
+    def test_spec_list_and_filter(self, tmp_path: Path) -> None:
+        """list_specs() returns everything, or only the Specs with the given status."""
+        manager = self._manager(tmp_path)
+        manager.create(Spec(spec_id="draft-1", goal="Draft"))
+        manager.create(Spec(spec_id="confirmed-1", goal="Confirmed"))
+        manager.confirm("confirmed-1")
+
+        everything = manager.list_specs()
+        drafts = manager.list_specs(status="draft")
+        confirmed_only = manager.list_specs(status="confirmed")
+
+        passed = (
+            len(everything) == 2
+            and len(drafts) == 1
+            and len(confirmed_only) == 1
+            and confirmed_only[0].spec_id == "confirmed-1"
+        )
+        _COLLECTOR.record(
+            dimension="spec_management",
+            case_id="spec_list_and_filter",
+            passed=passed,
+        )
+        assert passed
+
+    def test_spec_delete(self, tmp_path: Path) -> None:
+        """delete() returns True and a later get() returns None."""
+        manager = self._manager(tmp_path)
+        manager.create(Spec(spec_id="delete-me", goal="To be deleted"))
+
+        deleted = manager.delete("delete-me")
+        after_delete = manager.get("delete-me")
+
+        passed = deleted is True and after_delete is None
+        _COLLECTOR.record(
+            dimension="spec_management",
+            case_id="spec_delete",
+            passed=passed,
+        )
+        assert passed
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 8. 验证循环
+# ═══════════════════════════════════════════════════════════════════════════
+
+
+@pytest.mark.e2e_capability
+class TestVerificationLoop:
+    """Verification loop: verify() and verify_and_retry() of VerificationLoop.
+
+    Scenarios:
+    - a succeeding command yields passed=True
+    - a failing command yields passed=False
+    - a command that outlives the timeout yields passed=False
+    - retrying without and with a fix_callback
+    """
+
+    async def test_verify_success(self) -> None:
+        """A succeeding command passes and its output is captured."""
+        verifier = VerificationLoop(commands=["echo ok"], timeout=10.0)
+        outcome = await verifier.verify()
+
+        passed = outcome.passed is True and "ok" in outcome.test_output
+        _COLLECTOR.record(
+            dimension="verification_loop",
+            case_id="verify_success",
+            passed=passed,
+        )
+        assert passed
+
+    async def test_verify_failure(self) -> None:
+        """A failing command does not pass and reports at least one error."""
+        verifier = VerificationLoop(commands=["false"], timeout=10.0)
+        outcome = await verifier.verify()
+
+        passed = outcome.passed is False and len(outcome.errors) > 0
+        _COLLECTOR.record(
+            dimension="verification_loop",
+            case_id="verify_failure",
+            passed=passed,
+        )
+        assert passed
+
+    async def test_verify_timeout(self) -> None:
+        """A command running past the timeout fails with a 'timed out' error."""
+        verifier = VerificationLoop(commands=["sleep 10"], timeout=0.5)
+        outcome = await verifier.verify()
+
+        passed = outcome.passed is False and any(
+            "timed out" in message for message in outcome.errors
+        )
+        _COLLECTOR.record(
+            dimension="verification_loop",
+            case_id="verify_timeout",
+            passed=passed,
+        )
+        assert passed
+
+    async def test_verify_and_retry_no_callback(self) -> None:
+        """Without a fix_callback, max_retries=2 results in three attempts."""
+        verifier = VerificationLoop(commands=["false"], max_retries=2, timeout=5.0)
+        outcome = await verifier.verify_and_retry()
+
+        passed = outcome.passed is False and outcome.attempts == 3
+        _COLLECTOR.record(
+            dimension="verification_loop",
+            case_id="verify_and_retry_no_callback",
+            passed=passed,
+        )
+        assert passed
+
+    async def test_verify_and_retry_with_callback(self) -> None:
+        """With max_retries=1 and a command that keeps failing, fix_callback runs once."""
+        invocations: list[int] = []
+
+        async def fix_cb(errors: list[str], test_output: str) -> None:
+            # Only the number of calls is checked; the arguments are ignored.
+            invocations.append(1)
+
+        verifier = VerificationLoop(commands=["false"], max_retries=1, timeout=5.0)
+        outcome = await verifier.verify_and_retry(fix_callback=fix_cb)
+
+        passed = outcome.passed is False and len(invocations) == 1
+        _COLLECTOR.record(
+            dimension="verification_loop",
+            case_id="verify_and_retry_with_callback",
+            passed=passed,
+        )
+        assert passed
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# 9. 综合报告生成
+# ═══════════════════════════════════════════════════════════════════════════
+
+
+def _run_all_checks_for_report() -> _ResultCollector:
+    """Run checks for each report dimension and return a freshly filled collector.
+
+    The report test stays self-contained this way: nothing here depends on
+    other tests having run first or on the module-level collector.
+
+    Returns:
+        A new ``_ResultCollector`` holding one record per executed check.
+    """
+    collector = _ResultCollector()
+    preprocessor = _build_preprocessor()
+    search_index = ToolSearchIndex(_build_test_tools())
+
+    def _preprocess(text: str) -> Any:
+        # Drive one preprocess coroutine to completion with its own asyncio.run.
+        return asyncio.run(preprocessor.preprocess(content=text))
+
+    # --- 1. Preprocessing accuracy ---
+    for case in PREPROCESSING_CASES:
+        actual = _preprocess(case["input"]).execution_mode.value
+        collector.record(
+            dimension="preprocessing_accuracy",
+            case_id=case["id"],
+            passed=actual == case["expected_mode"],
+            input=case["input"],
+            expected=case["expected_mode"],
+            actual=actual,
+        )
+
+    # --- 2. Skill recall ---
+    for case in RECALL_CASES:
+        routed = _preprocess(case["input"])
+        recalled = (
+            routed.matched == case["expected_matched"]
+            and routed.skill_name == case["expected_skill"]
+            and routed.execution_mode.value == case["expected_mode"]
+        )
+        collector.record(
+            dimension="skill_recall",
+            case_id=case["id"],
+            passed=recalled,
+        )
+
+    # --- 3. Overfitting detection ---
+    for case in OVERFITTING_CASES:
+        # Original phrasing first, then the paraphrases in their listed order.
+        phrasings = [case["original"], *case["paraphrases"]]
+        modes = [_preprocess(text).execution_mode.value for text in phrasings]
+        collector.record(
+            dimension="overfitting_detection",
+            case_id=case["id"],
+            passed=all(mode == case["expected_mode"] for mode in modes),
+        )
+
+    # --- 4. Execution efficiency ---
+    for case in EFFICIENCY_CASES:
+        # Call asyncio.run inline so the timed region holds nothing but the call.
+        started_at = time.perf_counter()
+        routed = asyncio.run(preprocessor.preprocess(content=case["input"]))
+        elapsed_ms = (time.perf_counter() - started_at) * 1000
+        in_time_and_mode = (
+            elapsed_ms < case["max_time_ms"]
+            and routed.execution_mode.value == case["expected_mode"]
+        )
+        collector.record(
+            dimension="execution_efficiency",
+            case_id=case["id"],
+            passed=in_time_and_mode,
+            elapsed_ms=round(elapsed_ms, 2),
+        )
+
+    # --- 5. Tool search accuracy ---
+    for case in TOOL_SEARCH_CASES:
+        hits = search_index.search(case["query"], top_k=5)
+        hit_names = [hit.name for hit in hits]
+        if case.get("expected_empty"):
+            found = len(hits) == 0
+        elif "expected_top1" in case:
+            found = len(hits) > 0 and hits[0].name == case["expected_top1"]
+        elif "expected_contains" in case:
+            found = all(wanted in hit_names for wanted in case["expected_contains"])
+        else:
+            # A case without any expectation key counts as failed.
+            found = False
+        collector.record(
+            dimension="tool_search_accuracy",
+            case_id=case["id"],
+            passed=found,
+        )
+
+    # --- 6. Event model integrity ---
+    async def _run_event_checks() -> None:
+        # SQ: submit one item, then drain it.
+        submissions = SubmissionQueue()
+        await submissions.submit("test", "s1")
+        drained: list[Submission] = []
+
+        async def drain_one() -> None:
+            async for submission in submissions.drain():
+                drained.append(submission)
+                break
+
+        await asyncio.wait_for(asyncio.create_task(drain_one()), timeout=1.0)
+        collector.record(
+            dimension="event_model_integrity",
+            case_id="sq_submit_and_drain",
+            passed=len(drained) == 1,
+        )
+
+        # EQ: start a subscriber, then emit a single event.
+        events = EventQueue()
+        token_event = Event.create(TurnEventType.TOKEN, "t1", "s1", {"text": "hi"})
+        delivered: list[Event] = []
+
+        async def subscribe_one() -> None:
+            async for evt in events.subscribe():
+                delivered.append(evt)
+                break
+
+        listener = asyncio.create_task(subscribe_one())
+        # Let the listener task run before the event is emitted.
+        await asyncio.sleep(0.05)
+        await events.emit(token_event)
+        await asyncio.wait_for(listener, timeout=1.0)
+        collector.record(
+            dimension="event_model_integrity",
+            case_id="eq_emit_and_subscribe",
+            passed=len(delivered) == 1,
+        )
+
+    asyncio.run(_run_event_checks())
+
+    # Event type classification: one representative member per family.
+    prefixes_ok = (
+        SessionEventType.SESSION_STARTED.startswith("session.")
+        and TaskEventType.TASK_STARTED.startswith("task.")
+        and TurnEventType.TOKEN.startswith("turn.")
+    )
+    collector.record(
+        dimension="event_model_integrity",
+        case_id="event_type_classification",
+        passed=prefixes_ok,
+    )
+
+    # --- 7. Spec management ---
+    import tempfile
+
+    with tempfile.TemporaryDirectory() as specs_root:
+        manager = SpecManager(specs_dir=specs_root)
+        manager.create(
+            Spec(
+                spec_id="report-spec",
+                goal="Report test",
+                steps=[SpecStep(step_id="s1", name="S1", description="Step 1")],
+            )
+        )
+        loaded = manager.get("report-spec")
+        collector.record(
+            dimension="spec_management",
+            case_id="spec_create_and_get",
+            passed=loaded is not None and loaded.goal == "Report test",
+        )
+
+        confirmed = manager.confirm("report-spec")
+        collector.record(
+            dimension="spec_management",
+            case_id="spec_confirm",
+            passed=confirmed is not None and confirmed.status == "confirmed",
+        )
+
+    # --- 8. Verification loop ---
+    async def _run_verification_checks() -> None:
+        succeeding = VerificationLoop(commands=["echo ok"], timeout=10.0)
+        success_outcome = await succeeding.verify()
+        collector.record(
+            dimension="verification_loop",
+            case_id="verify_success",
+            passed=success_outcome.passed is True,
+        )
+
+        failing = VerificationLoop(commands=["false"], timeout=5.0)
+        failure_outcome = await failing.verify()
+        collector.record(
+            dimension="verification_loop",
+            case_id="verify_failure",
+            passed=failure_outcome.passed is False,
+        )
+
+    asyncio.run(_run_verification_checks())
+
+    return collector
+
+
+def _generate_json_report(collector: _ResultCollector) -> dict[str, Any]:
+    """Build the comprehensive report as a JSON-serialisable dict.
+
+    Args:
+        collector: Source of the per-dimension case records and scores.
+
+    Returns:
+        A dict with the overall score, case totals, per-dimension scores and
+        details for the eight report dimensions, and a list of suggestions.
+    """
+    dimensions = (
+        "preprocessing_accuracy",
+        "skill_recall",
+        "overfitting_detection",
+        "execution_efficiency",
+        "tool_search_accuracy",
+        "event_model_integrity",
+        "spec_management",
+        "verification_loop",
+    )
+
+    dimension_scores: dict[str, float] = {}
+    dimension_details: dict[str, Any] = {}
+    for dim in dimensions:
+        rounded = round(collector.dimension_score(dim), 1)
+        # A dimension without records is reported with an empty case list.
+        recorded = collector.results.get(dim, [])
+        dimension_scores[dim] = rounded
+        dimension_details[dim] = {
+            "total": len(recorded),
+            "passed": sum(1 for entry in recorded if entry["passed"]),
+            "score": rounded,
+            "cases": recorded,
+        }
+
+    total_score = collector.total_score()
+
+    # One suggestion per dimension below 100, or a single all-clear line.
+    suggestions: list[str] = [
+        f"[{dim}] 得分 {score:.1f}%，存在失败用例，需检查相关组件"
+        for dim, score in dimension_scores.items()
+        if score < 100
+    ]
+    if not suggestions:
+        suggestions.append("所有维度均达到 100%，架构状态良好")
+
+    # Totals cover every dimension present in the collector, not only the eight above.
+    every_case = [entry for cases in collector.results.values() for entry in cases]
+
+    return {
+        "report_type": "comprehensive_capability_backtest",
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "total_score": round(total_score, 1),
+        "total_cases": len(every_case),
+        "total_passed": sum(1 for entry in every_case if entry["passed"]),
+        "dimension_scores": dimension_scores,
+        "dimension_details": dimension_details,
+        "suggestions": suggestions,
+    }
+
+
+def _generate_text_report(json_report: dict[str, Any]) -> str:
+    """Render the JSON report as a plain-text report (Chinese labels).
+
+    Args:
+        json_report: A report dict shaped like the output of ``_generate_json_report``.
+
+    Returns:
+        The report as one newline-joined string: header, per-dimension scores,
+        per-case results and suggestions.
+    """
+    sep = "=" * 70
+    rule = "-" * 70
+    # Display names for the known dimensions; unknown keys fall back to the raw key.
+    dim_names: dict[str, str] = {
+        "preprocessing_accuracy": "预处理准确度",
+        "skill_recall": "技能召回率",
+        "overfitting_detection": "过拟合检测",
+        "execution_efficiency": "执行效率",
+        "tool_search_accuracy": "工具搜索准确度",
+        "event_model_integrity": "事件模型完整性",
+        "spec_management": "Spec 管理功能",
+        "verification_loop": "验证循环",
+    }
+
+    # Header block
+    lines: list[str] = [
+        sep,
+        "Fischer AgentKit 综合能力回测报告",
+        sep,
+        f"生成时间: {json_report['generated_at']}",
+        f"总体评分: {json_report['total_score']:.1f}%",
+        (
+            f"用例总数: {json_report['total_cases']}  "
+            f"通过: {json_report['total_passed']}  "
+            f"失败: {json_report['total_cases'] - json_report['total_passed']}"
+        ),
+        "",
+    ]
+
+    # Per-dimension scores
+    lines.extend([rule, "各维度得分", rule])
+    for dim, score in json_report["dimension_scores"].items():
+        name = dim_names.get(dim, dim)
+        detail = json_report["dimension_details"][dim]
+        status = "✓" if score == 100 else "✗"
+        lines.append(f"  {status} {name}: {score:.1f}% ({detail['passed']}/{detail['total']})")
+    lines.append("")
+
+    # Per-case results, grouped by dimension
+    lines.extend([rule, "详细用例结果", rule])
+    for dim, details in json_report["dimension_details"].items():
+        name = dim_names.get(dim, dim)
+        # The embedded newline leaves a blank line before each group heading.
+        lines.append(f"\n[{name}]")
+        for case in details["cases"]:
+            status = "✓" if case["passed"] else "✗"
+            lines.append(f"  {status} {case['case_id']}")
+    lines.append("")
+
+    # Suggestions
+    lines.extend([rule, "改进建议", rule])
+    lines.extend(f"  • {suggestion}" for suggestion in json_report["suggestions"])
+    lines.extend(["", sep])
+
+    return "\n".join(lines)
+
+
+@pytest.mark.e2e_capability
+class TestComprehensiveReport:
+    """Comprehensive report: run the checks and write the capability report.
+
+    Output files:
+    - JSON report: test-results/e2e/comprehensive_report.json
+    - text report: test-results/e2e/comprehensive_report.txt
+    """
+
+    def test_generate_comprehensive_report(self, tmp_path: Path) -> None:
+        """Run the checks for every dimension, then write and validate both reports."""
+        # NOTE: tmp_path is requested but unused; the reports are written to a
+        # directory relative to the current working directory.
+
+        # Self-contained run, independent of the execution order of other tests.
+        collector = _run_all_checks_for_report()
+
+        # Adopt dimensions that only the module-level collector knows about.
+        for dim, cases in _COLLECTOR.results.items():
+            collector.results.setdefault(dim, cases)
+
+        json_report = _generate_json_report(collector)
+        text_report = _generate_text_report(json_report)
+
+        output_dir = Path("test-results/e2e")
+        output_dir.mkdir(parents=True, exist_ok=True)
+        json_path = output_dir / "comprehensive_report.json"
+        text_path = output_dir / "comprehensive_report.txt"
+
+        # Write the JSON report first, then the text report.
+        serialized = json.dumps(json_report, ensure_ascii=False, indent=2)
+        json_path.write_text(serialized, encoding="utf-8")
+        text_path.write_text(text_report, encoding="utf-8")
+
+        # Echo the report and the file locations to the console.
+        print(f"\n{text_report}")
+        print(f"\nJSON 报告: {json_path}")
+        print(f"文本报告: {text_path}")
+
+        # Both files must exist.
+        assert json_path.exists(), "JSON report file not generated"
+        assert text_path.exists(), "Text report file not generated"
+
+        # The report must contain cases and all eight dimensions.
+        assert json_report["total_cases"] > 0, "No test cases in report"
+        assert len(json_report["dimension_scores"]) == 8, "Expected 8 dimensions in report"
+
+        # Individual cases may fail, but the overall score has to reach 80%.
+        total_score = json_report["total_score"]
+        print(f"\n总体评分: {total_score:.1f}%")
+        assert total_score >= 80.0, f"Total score {total_score:.1f}% is below 80% threshold"
diff --git a/tests/e2e/test_capability_router_direct.py b/tests/e2e/test_capability_router_direct.py
deleted file mode 100644
index 0536d00..0000000
--- a/tests/e2e/test_capability_router_direct.py
+++ /dev/null
@@ -1,405 +0,0 @@
-"""E2E Agent Capability Tests — Router Direct Backtest Layer (Real LLM).
-
-Directly tests CostAwareRouter.route() using real LLM configuration
-loaded from agentkit.yaml. Records full SkillRoutingResult for precise
-root cause analysis:
-  - match_method (layer0/layer1/layer1.5/layer2)
-  - match_confidence
-  - complexity score
-  - execution_trace
-"""
-
-import asyncio
-import os
-from pathlib import Path
-
-import pytest
-
-from agentkit.chat.skill_routing import CostAwareRouter
-from agentkit.router.intent import IntentRouter
-from agentkit.server.app import _build_llm_gateway, _build_skill_registry
-from agentkit.server.config import ServerConfig
-from agentkit.skills.registry import SkillRegistry
-
-from tests.e2e.benchmark_dataset import (
-    ALL_BENCHMARKS,
-    ROUTING_KEYWORD_BENCHMARKS,
-    ROUTING_EDGE_BENCHMARKS,
-    SEMANTIC_ROUTER_BENCHMARKS,
-    BenchmarkCase,
-)
-from tests.e2e.capability_metrics import MetricsCollector
-
-
-# ═══════════════════════════════════════════════════════════════════════════
-# Real component initialization from agentkit.yaml
-# ═══════════════════════════════════════════════════════════════════════════
-
-
-def _find_config_path() -> str | None:
-    """Find agentkit.yaml in standard search paths."""
-    candidates = [
-        os.environ.get("AGENTKIT_CONFIG", ""),
-        str(Path.cwd() / "agentkit.yaml"),
-        str(Path.home() / ".agentkit" / "agentkit.yaml"),
-    ]
-    for path in candidates:
-        if path and Path(path).is_file():
-            return path
-    return None
-
-
-def _build_real_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
-    """Build real components from agentkit.yaml configuration.
-
-    Returns (router, skill_registry, intent_router).
-    Raises skip if no valid LLM provider is configured.
-    """
-    config_path = _find_config_path()
-    if not config_path:
-        pytest.skip("No agentkit.yaml found — cannot build real components")
-
-    # Load .env if present
-    env_path = Path(config_path).parent / ".env"
-    if env_path.exists():
-        try:
-            from dotenv import load_dotenv
-
-            load_dotenv(env_path)
-        except ImportError:
-            # python-dotenv not installed, manually parse .env
-            with open(env_path) as f:
-                for line in f:
-                    line = line.strip()
-                    if line and not line.startswith("#") and "=" in line:
-                        key, _, value = line.partition("=")
-                        os.environ.setdefault(key.strip(), value.strip().strip("'\""))
-
-    server_config = ServerConfig.from_yaml(config_path)
-
-    # Check if any LLM provider has a valid API key
-    if not server_config.has_llm_provider():
-        # Try to inject DASHSCOPE_API_KEY from environment
-        dashscope_key = os.environ.get("DASHSCOPE_API_KEY", "")
-        if dashscope_key:
-            # Inject into the test provider config
-            for name, pconf in server_config.llm_config.providers.items():
-                if not pconf.api_key:
-                    pconf.api_key = dashscope_key
-                    # Set base_url for dashscope if missing
-                    # Use coding base_url for bailian-coding keys (sk-sp-* prefix)
-                    if not pconf.base_url:
-                        if dashscope_key.startswith("sk-sp-"):
-                            pconf.base_url = "https://coding.dashscope.aliyuncs.com/v1"
-                        else:
-                            pconf.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
-                    break
-
-    if not server_config.has_llm_provider():
-        pytest.skip("No LLM provider with valid API key — skipping real LLM tests")
-
-    # Build real LLM gateway
-    llm_gateway = _build_llm_gateway(server_config)
-
-    # Build real skill registry from configs/skills
-    skill_registry = _build_skill_registry(server_config)
-
-    # Build real intent router
-    intent_router = IntentRouter(llm_gateway=llm_gateway)
-
-    # Build real CostAwareRouter
-    router_conf = server_config.router or {}
-
-    # Build SemanticRouter if enabled or if embedding is available
-    semantic_router = None
-    semantic_conf = router_conf.get("semantic", {})
-    if semantic_conf.get("enabled", False):
-        try:
-            from agentkit.chat.semantic_router import SemanticRouter
-            from agentkit.memory.embedder import OpenAIEmbedder
-
-            # Try to get embedder from LLM gateway cache first
-            embedder = getattr(llm_gateway, "_embedder", None)
-
-            # If no cache embedder, create one directly from provider config
-            if embedder is None:
-                # Find a provider with an API key to use for embedding
-                for pname, pconf in server_config.llm_config.providers.items():
-                    if pconf.api_key:
-                        # Use correct base_url based on key prefix
-                        if pconf.api_key.startswith("sk-sp-"):
-                            base_url = pconf.base_url or "https://coding.dashscope.aliyuncs.com/v1"
-                        else:
-                            base_url = pconf.base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
-                        embedder = OpenAIEmbedder(
-                            api_key=pconf.api_key,
-                            base_url=base_url,
-                            model="text-embedding-v3",
-                        )
-                        print(f"Created embedder from provider '{pname}' (base_url={base_url})")
-                        break
-
-            if embedder is not None:
-                semantic_router = SemanticRouter(
-                    embedder=embedder,
-                    similarity_high=semantic_conf.get("similarity_high", 0.85),
-                    similarity_low=semantic_conf.get("similarity_low", 0.4),
-                )
-                # Build skill embedding index
-                import asyncio
-
-                try:
-                    loop = asyncio.get_running_loop()
-                except RuntimeError:
-                    loop = None
-
-                if loop and loop.is_running():
-                    # Already in async context (pytest-asyncio), schedule in background
-                    import concurrent.futures
-
-                    with concurrent.futures.ThreadPoolExecutor() as pool:
-                        pool.submit(asyncio.run, semantic_router.build_index(skill_registry)).result()
-                else:
-                    asyncio.run(semantic_router.build_index(skill_registry))
-                print(f"SemanticRouter built: {semantic_router._index.size} skills indexed")
-            else:
-                print("Warning: No embedder available for SemanticRouter")
-        except Exception as e:
-            print(f"Warning: SemanticRouter not available: {e}")
-
-    router = CostAwareRouter(
-        llm_gateway=llm_gateway,
-        model="default",
-        org_context=None,
-        auction_enabled=router_conf.get("auction_enabled", False),
-        classifier=router_conf.get("classifier", "heuristic"),
-        merged_llm_classify=router_conf.get("merged_llm_classify", True),
-        semantic_router=semantic_router,
-    )
-
-    return router, skill_registry, intent_router
-
-
-# Cache components at module level to avoid rebuilding for every test
-_cached_components: tuple[CostAwareRouter, SkillRegistry, IntentRouter] | None = None
-
-
-def _get_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
-    """Get or build real components (cached for session)."""
-    global _cached_components
-    if _cached_components is None:
-        _cached_components = _build_real_components()
-    return _cached_components
-
-
-# ═══════════════════════════════════════════════════════════════════════════
-# Helper: Run a single benchmark through the real router
-# ═══════════════════════════════════════════════════════════════════════════
-
-
-async def _run_router_benchmark(
-    benchmark: BenchmarkCase,
-    collector: MetricsCollector,
-    test_name: str,
-    is_paraphrase: bool = False,
-    input_override: str | None = None,
-) -> dict:
-    """Run a single benchmark through the real router."""
-    router, skill_registry, intent_router = _get_components()
-    query = input_override or benchmark.input
-
-    collector.start_timer(benchmark.id)
-
-    try:
-        result = await router.route(
-            content=query,
-            skill_registry=skill_registry,
-            intent_router=intent_router,
-            default_tools=[],
-            default_system_prompt=None,
-        )
-
-        actual_skill = result.skill_name
-        actual_exec_mode = result.execution_mode.value if result.execution_mode else None
-        actual_complexity = result.complexity
-        actual_match_method = result.match_method
-        actual_match_confidence = result.match_confidence
-        task_succeeded = True
-        error_msg = None
-    except Exception as e:
-        actual_skill = None
-        actual_exec_mode = None
-        actual_complexity = 0.0
-        actual_match_method = None
-        actual_match_confidence = 0.0
-        task_succeeded = False
-        error_msg = str(e)[:200]
-
-    # Map complexity score to level
-    if actual_complexity < 0.3:
-        actual_complexity_level = "low"
-    elif actual_complexity < 0.7:
-        actual_complexity_level = "medium"
-    else:
-        actual_complexity_level = "high"
-
-    # Judge correctness
-    skill_correct = None
-    if benchmark.expected_skill is not None and actual_skill is not None:
-        skill_correct = actual_skill == benchmark.expected_skill
-    elif benchmark.expected_skill is None:
-        skill_correct = actual_skill is None or task_succeeded
-
-    execution_mode_correct = None
-    if actual_exec_mode is not None and benchmark.expected_execution_mode:
-        mode_map = {
-            "direct": "DIRECT_CHAT",
-            "react": "SKILL_REACT",
-            "rewoo": "REWOO",
-            "reflexion": "REFLEXION",
-            "plan_exec": "PLAN_EXEC",
-            "team_collab": "TEAM_COLLAB",
-            "llm_generate": "SKILL_REACT",
-            "tool_call": "SKILL_REACT",
-            "custom": "SKILL_REACT",
-        }
-        expected_normalized = mode_map.get(
-            benchmark.expected_execution_mode, benchmark.expected_execution_mode.upper()
-        )
-        execution_mode_correct = actual_exec_mode.upper() == expected_normalized
-
-    complexity_correct = actual_complexity_level == benchmark.expected_complexity
-
-    obs = collector.record_benchmark_result(
-        benchmark,
-        test_name=test_name,
-        actual_skill=actual_skill,
-        actual_execution_mode=actual_exec_mode,
-        actual_status_code=200 if task_succeeded else 500,
-        task_succeeded=task_succeeded,
-        is_paraphrase=is_paraphrase,
-        error_message=error_msg,
-    )
-    obs.complexity_correct = complexity_correct
-
-    return {
-        "skill_correct": skill_correct,
-        "execution_mode_correct": execution_mode_correct,
-        "complexity_correct": complexity_correct,
-        "actual_skill": actual_skill,
-        "actual_exec_mode": actual_exec_mode,
-        "actual_complexity": actual_complexity,
-        "actual_complexity_level": actual_complexity_level,
-        "actual_match_method": actual_match_method,
-        "actual_match_confidence": actual_match_confidence,
-        "task_succeeded": task_succeeded,
-    }
-
-
-# ═══════════════════════════════════════════════════════════════════════════
-# Layer 0: Rule Matching Tests
-# ═══════════════════════════════════════════════════════════════════════════
-
-
-@pytest.mark.e2e_capability
-class TestRouterLayer0:
-    """Test Layer 0 rule matching with real router."""
-
-    @pytest.mark.parametrize(
-        "benchmark",
-        [
-            b
-            for b in ROUTING_EDGE_BENCHMARKS
-            if b.subcategory in ("greeting", "identity", "explicit_prefix")
-        ],
-        ids=[
-            b.id
-            for b in ROUTING_EDGE_BENCHMARKS
-            if b.subcategory in ("greeting", "identity", "explicit_prefix")
-        ],
-    )
-    def test_layer0_rules(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
-        """Layer 0 should correctly match greetings, identity, and @skill: prefix."""
-        result = asyncio.run(
-            _run_router_benchmark(benchmark, metrics_collector, f"layer0_{benchmark.id}")
-        )
-        if benchmark.subcategory == "greeting":
-            assert result["actual_match_method"] in ("layer0", None) or result["task_succeeded"]
-        if benchmark.subcategory == "explicit_prefix":
-            assert result["actual_skill"] == benchmark.expected_skill or result["task_succeeded"]
-
-
-# ═══════════════════════════════════════════════════════════════════════════
-# Layer 1: Complexity Classification Tests
-# ═══════════════════════════════════════════════════════════════════════════
-
-
-@pytest.mark.e2e_capability
-class TestRouterLayer1:
-    """Test Layer 1 complexity classification with real router."""
-
-    @pytest.mark.parametrize(
-        "benchmark",
-        ROUTING_KEYWORD_BENCHMARKS,
-        ids=[b.id for b in ROUTING_KEYWORD_BENCHMARKS],
-    )
-    def test_complexity_classification(
-        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
-    ):
-        """HeuristicClassifier should correctly estimate complexity."""
-        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, f"layer1_{benchmark.id}"))
-
-
-# ═══════════════════════════════════════════════════════════════════════════
-# Semantic Router Tests
-# ═══════════════════════════════════════════════════════════════════════════
-
-
-@pytest.mark.e2e_capability
-class TestSemanticRouter:
-    """Test semantic router matching with real router."""
-
-    @pytest.mark.parametrize(
-        "benchmark",
-        SEMANTIC_ROUTER_BENCHMARKS,
-        ids=[b.id for b in SEMANTIC_ROUTER_BENCHMARKS],
-    )
-    def test_semantic_match(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
-        """SemanticRouter should match skill descriptions."""
-        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, f"semantic_{benchmark.id}"))
-
-
-# ═══════════════════════════════════════════════════════════════════════════
-# Paraphrase Consistency Tests (Overfitting Detection)
-# ═══════════════════════════════════════════════════════════════════════════
-
-
-@pytest.mark.e2e_capability
-class TestRouterParaphraseConsistency:
-    """Test that paraphrased inputs route to the same skill as originals."""
-
-    @pytest.mark.parametrize(
-        "benchmark",
-        [b for b in ALL_BENCHMARKS if b.paraphrases and b.expected_skill is not None][:10],
-        ids=[b.id for b in ALL_BENCHMARKS if b.paraphrases and b.expected_skill is not None][:10],
-    )
-    def test_paraphrase_routes_same_skill(
-        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
-    ):
-        """Original and paraphrased inputs should route to the same skill."""
-        # Run original
-        asyncio.run(
-            _run_router_benchmark(benchmark, metrics_collector, f"para_orig_{benchmark.id}")
-        )
-
-        # Run paraphrases
-        for i, para in enumerate(benchmark.paraphrases):
-            asyncio.run(
-                _run_router_benchmark(
-                    benchmark,
-                    metrics_collector,
-                    f"para_{benchmark.id}_{i}",
-                    is_paraphrase=True,
-                    input_override=para,
-                )
-            )