feat: comprehensive capability benchmark and agentkit benchmark CLI

2026-06-17 11:28:09 +08:00 · 2026-06-17 11:28:09 +08:00 · d00995504d
parent ecf87391a5
commit d00995504d
9 changed files with 3865 additions and 405 deletions
--- a/src/agentkit/cli/benchmark.py
+++ b/src/agentkit/cli/benchmark.py
--- a/src/agentkit/cli/main.py
+++ b/src/agentkit/cli/main.py
@@ -35,6 +35,10 @@ from agentkit.cli.chat import chat  # noqa: E402

 app.command(name="chat")(chat)

+from agentkit.cli.benchmark import benchmark  # noqa: E402
+
+app.command(name="benchmark")(benchmark)
+

 @app.command()
 def gui(
--- a/test-results/benchmark/benchmark_report.html
+++ b/test-results/benchmark/benchmark_report.html
@@ -0,0 +1,44 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8">
+<title>AgentKit Benchmark Report</title>
+<style>
+  body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; margin: 2em; }
+  h1 { color: #1a1a2e; }
+  .meta { color: #666; margin-bottom: 1em; }
+  table { border-collapse: collapse; width: 100%; margin: 1em 0; }
+  th, td { border: 1px solid #ddd; padding: 8px 12px; text-align: left; }
+  th { background-color: #1a1a2e; color: white; }
+  td.num { text-align: right; font-family: monospace; }
+  td.pass { color: #2e7d32; }
+  td.fail { color: #c62828; }
+  .score-good { color: #2e7d32; font-weight: bold; }
+  .score-warn { color: #e65100; font-weight: bold; }
+  .score-bad { color: #c62828; font-weight: bold; }
+  .overall-row { background-color: #f5f5f5; }
+  .failure { margin: 0.5em 0; padding: 0.5em; background: #fff3e0; border-left: 3px solid #ff9800; }
+  .failure .dim { color: #e65100; font-weight: bold; }
+  .failure .case { font-family: monospace; }
+  .failure .detail { font-size: 0.85em; color: #555; margin-left: 1em; }
+  .all-pass { color: #2e7d32; font-weight: bold; }
+</style>
+</head>
+<body>
+<h1>AgentKit Benchmark Report</h1>
+<div class="meta">
+  <p>Timestamp: 2026-06-17T03:26:25.072956+00:00</p>
+  <p>Version: 0.1.0</p>
+  <p>Overall Score: <strong>98.0%</strong></p>
+  <p>Summary: 50/51 tests passed (1 failed) across 7 dimensions.</p>
+</div>
+<h2>Dimension Results</h2>
+<table>
+<thead><tr><th>Dimension</th><th>Total</th><th>Pass</th><th>Fail</th><th>Score</th></tr></thead>
+<tbody>
+<tr><td>preprocessing</td><td class='num'>15</td><td class='num pass'>14</td><td class='num fail'>1</td><td class='num score-good'>93.3%</td></tr><tr><td>overfitting</td><td class='num'>3</td><td class='num pass'>3</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>efficiency</td><td class='num'>5</td><td class='num pass'>5</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>tool_search</td><td class='num'>10</td><td class='num pass'>10</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>event_model</td><td class='num'>6</td><td class='num pass'>6</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>spec_management</td><td class='num'>7</td><td class='num pass'>7</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>verification</td><td class='num'>5</td><td class='num pass'>5</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr class='overall-row'><td><strong>OVERALL</strong></td><td class='num'><strong>51</strong></td><td class='num pass'><strong>50</strong></td><td class='num fail'><strong>1</strong></td><td class='num score-good'><strong>98.0%</strong></td></tr>
+</tbody>
+</table>
+<h2>Failed Cases</h2><div class='failure'><span class='dim'>[preprocessing]</span> <span class='case'>skill_prefix_direct</span><div class='detail'>expected: skill_react</div><div class='detail'>actual: direct_chat</div></div>
+</body>
+</html>
--- a/test-results/benchmark/benchmark_report.json
+++ b/test-results/benchmark/benchmark_report.json
@@ -0,0 +1,472 @@
+{
+  "timestamp": "2026-06-17T03:26:25.072956+00:00",
+  "version": "0.1.0",
+  "dimensions": {
+    "preprocessing": {
+      "score": 0.9333,
+      "total": 15,
+      "passed": 14,
+      "failed": 1,
+      "details": [
+        {
+          "case_id": "greeting_cn",
+          "passed": true,
+          "expected": "direct_chat",
+          "actual": "direct_chat",
+          "duration_ms": 0.03,
+          "detail": "input='你好' method=regex_direct"
+        },
+        {
+          "case_id": "greeting_en",
+          "passed": true,
+          "expected": "direct_chat",
+          "actual": "direct_chat",
+          "duration_ms": 0.02,
+          "detail": "input='hello' method=regex_direct"
+        },
+        {
+          "case_id": "chitchat_thanks",
+          "passed": true,
+          "expected": "direct_chat",
+          "actual": "direct_chat",
+          "duration_ms": 0.01,
+          "detail": "input='谢谢' method=regex_direct"
+        },
+        {
+          "case_id": "identity_who",
+          "passed": true,
+          "expected": "direct_chat",
+          "actual": "direct_chat",
+          "duration_ms": 0.02,
+          "detail": "input='你是谁' method=regex_direct"
+        },
+        {
+          "case_id": "colloquial_ip_1",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.02,
+          "detail": "input='查下ip' method=default_react"
+        },
+        {
+          "case_id": "colloquial_ip_2",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.01,
+          "detail": "input='查看当前ip' method=default_react"
+        },
+        {
+          "case_id": "tool_search",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.01,
+          "detail": "input='搜索golang教程' method=default_react"
+        },
+        {
+          "case_id": "tool_shell",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.01,
+          "detail": "input='执行ls命令' method=default_react"
+        },
+        {
+          "case_id": "translation",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.01,
+          "detail": "input='翻译hello为中文' method=default_react"
+        },
+        {
+          "case_id": "knowledge",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.01,
+          "detail": "input='什么是机器学习' method=default_react"
+        },
+        {
+          "case_id": "skill_prefix_react",
+          "passed": true,
+          "expected": "skill_react",
+          "actual": "skill_react",
+          "duration_ms": 0.03,
+          "detail": "input='@skill:react_agent 查看ip' method=skill_prefix"
+        },
+        {
+          "case_id": "skill_prefix_direct",
+          "passed": false,
+          "expected": "skill_react",
+          "actual": "direct_chat",
+          "duration_ms": 0.02,
+          "detail": "input='@skill:chat_only 你好' method=skill_prefix"
+        },
+        {
+          "case_id": "skill_not_found",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.13,
+          "detail": "input='@skill:nonexistent 做点什么' method=skill_not_found_fallback"
+        },
+        {
+          "case_id": "complex_analysis",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.01,
+          "detail": "input='帮我分析一下这个数据并生成报告' method=default_react"
+        },
+        {
+          "case_id": "empty_fallback",
+          "passed": true,
+          "expected": "react",
+          "actual": "react",
+          "duration_ms": 0.01,
+          "detail": "input='随便聊聊' method=default_react"
+        }
+      ]
+    },
+    "overfitting": {
+      "score": 1.0,
+      "total": 3,
+      "passed": 3,
+      "failed": 0,
+      "details": [
+        {
+          "case_id": "ip_check_variants",
+          "passed": true,
+          "expected": "react",
+          "actual": "react,react,react,react,react",
+          "duration_ms": 0.0,
+          "detail": "paraphrases=5 consistent=True"
+        },
+        {
+          "case_id": "search_variants",
+          "passed": true,
+          "expected": "react",
+          "actual": "react,react,react",
+          "duration_ms": 0.0,
+          "detail": "paraphrases=3 consistent=True"
+        },
+        {
+          "case_id": "greeting_variants",
+          "passed": true,
+          "expected": "direct_chat",
+          "actual": "direct_chat,direct_chat,direct_chat,direct_chat,direct_chat",
+          "duration_ms": 0.0,
+          "detail": "paraphrases=5 consistent=True"
+        }
+      ]
+    },
+    "efficiency": {
+      "score": 1.0,
+      "total": 5,
+      "passed": 5,
+      "failed": 0,
+      "details": [
+        {
+          "case_id": "preprocess_greeting",
+          "passed": true,
+          "expected": "<= 50.0ms/call",
+          "actual": "0.004ms/call",
+          "duration_ms": 0.44,
+          "detail": "iterations=100"
+        },
+        {
+          "case_id": "preprocess_react",
+          "passed": true,
+          "expected": "<= 50.0ms/call",
+          "actual": "0.004ms/call",
+          "duration_ms": 0.38,
+          "detail": "iterations=100"
+        },
+        {
+          "case_id": "preprocess_skill_prefix",
+          "passed": true,
+          "expected": "<= 50.0ms/call",
+          "actual": "0.005ms/call",
+          "duration_ms": 0.51,
+          "detail": "iterations=100"
+        },
+        {
+          "case_id": "tool_search_query",
+          "passed": true,
+          "expected": "<= 10.0ms/call",
+          "actual": "0.008ms/call",
+          "duration_ms": 1.69,
+          "detail": "iterations=200"
+        },
+        {
+          "case_id": "tool_search_empty",
+          "passed": true,
+          "expected": "<= 5.0ms/call",
+          "actual": "0.000ms/call",
+          "duration_ms": 0.08,
+          "detail": "iterations=200"
+        }
+      ]
+    },
+    "tool_search": {
+      "score": 1.0,
+      "total": 10,
+      "passed": 10,
+      "failed": 0,
+      "details": [
+        {
+          "case_id": "read_file_query",
+          "passed": true,
+          "expected": "read_file",
+          "actual": "read_file",
+          "duration_ms": 0.02,
+          "detail": "query='read file' top_k=5 results=2"
+        },
+        {
+          "case_id": "write_file_query",
+          "passed": true,
+          "expected": "write_file",
+          "actual": "write_file",
+          "duration_ms": 0.02,
+          "detail": "query='write file content' top_k=5 results=2"
+        },
+        {
+          "case_id": "web_search_query",
+          "passed": true,
+          "expected": "web_search",
+          "actual": "web_search",
+          "duration_ms": 0.02,
+          "detail": "query='search web information' top_k=5 results=2"
+        },
+        {
+          "case_id": "shell_exec_query",
+          "passed": true,
+          "expected": "shell_exec",
+          "actual": "shell_exec",
+          "duration_ms": 0.02,
+          "detail": "query='execute shell command' top_k=5 results=1"
+        },
+        {
+          "case_id": "http_request_query",
+          "passed": true,
+          "expected": "http_request",
+          "actual": "http_request",
+          "duration_ms": 0.03,
+          "detail": "query='send http request url' top_k=5 results=1"
+        },
+        {
+          "case_id": "file_tag_query",
+          "passed": true,
+          "expected": "read_file",
+          "actual": "read_file",
+          "duration_ms": 0.02,
+          "detail": "query='io file' top_k=5 results=2"
+        },
+        {
+          "case_id": "empty_query",
+          "passed": true,
+          "expected": "__none__",
+          "actual": "[]",
+          "duration_ms": 0.0,
+          "detail": "query='' top_k=5 results=0"
+        },
+        {
+          "case_id": "no_match_query",
+          "passed": true,
+          "expected": "__none__",
+          "actual": "[]",
+          "duration_ms": 0.01,
+          "detail": "query='zzzznonexistent' top_k=5 results=0"
+        },
+        {
+          "case_id": "top_k_limit",
+          "passed": true,
+          "expected": "read_file",
+          "actual": "read_file",
+          "duration_ms": 0.02,
+          "detail": "query='file' top_k=1 results=1"
+        },
+        {
+          "case_id": "multi_token_query",
+          "passed": true,
+          "expected": "web_search",
+          "actual": "web_search",
+          "duration_ms": 0.03,
+          "detail": "query='search query engine' top_k=5 results=1"
+        }
+      ]
+    },
+    "event_model": {
+      "score": 1.0,
+      "total": 6,
+      "passed": 6,
+      "failed": 0,
+      "details": [
+        {
+          "case_id": "sq_submit_drain",
+          "passed": true,
+          "expected": "task_id + drained=['hello']",
+          "actual": "task_id=571839fb... drained=['hello']",
+          "duration_ms": 0.1,
+          "detail": ""
+        },
+        {
+          "case_id": "sq_cancel",
+          "passed": true,
+          "expected": "cancelled=True",
+          "actual": "cancelled=True",
+          "duration_ms": 0.04,
+          "detail": ""
+        },
+        {
+          "case_id": "sq_close_blocks",
+          "passed": true,
+          "expected": "RuntimeError on submit after close",
+          "actual": "raised=True closed=True",
+          "duration_ms": 0.02,
+          "detail": ""
+        },
+        {
+          "case_id": "eq_emit_subscribe_replay",
+          "passed": true,
+          "expected": "1 event replayed",
+          "actual": "1 events",
+          "duration_ms": 0.07,
+          "detail": ""
+        },
+        {
+          "case_id": "eq_close_sentinel",
+          "passed": true,
+          "expected": "subscriber exits on close",
+          "actual": "1 events, closed=True",
+          "duration_ms": 21.59,
+          "detail": ""
+        },
+        {
+          "case_id": "eq_subscriber_count",
+          "passed": true,
+          "expected": "0 subscribers initially",
+          "actual": "0 subscribers",
+          "duration_ms": 0.01,
+          "detail": ""
+        }
+      ]
+    },
+    "spec_management": {
+      "score": 1.0,
+      "total": 7,
+      "passed": 7,
+      "failed": 0,
+      "details": [
+        {
+          "case_id": "spec_create",
+          "passed": true,
+          "expected": "file exists on disk",
+          "actual": "exists=True",
+          "duration_ms": 2.24,
+          "detail": ""
+        },
+        {
+          "case_id": "spec_get",
+          "passed": true,
+          "expected": "spec with 2 steps",
+          "actual": "steps=2",
+          "duration_ms": 0.0,
+          "detail": ""
+        },
+        {
+          "case_id": "spec_update",
+          "passed": true,
+          "expected": "goal='Updated goal'",
+          "actual": "goal=Updated goal",
+          "duration_ms": 1.75,
+          "detail": ""
+        },
+        {
+          "case_id": "spec_confirm",
+          "passed": true,
+          "expected": "status=confirmed, all steps confirmed",
+          "actual": "status=confirmed",
+          "duration_ms": 1.86,
+          "detail": ""
+        },
+        {
+          "case_id": "spec_list",
+          "passed": true,
+          "expected": "2 specs",
+          "actual": "2 specs",
+          "duration_ms": 4.92,
+          "detail": ""
+        },
+        {
+          "case_id": "spec_delete",
+          "passed": true,
+          "expected": "deleted, 1 remaining",
+          "actual": "deleted=True, remaining=1",
+          "duration_ms": 1.94,
+          "detail": ""
+        },
+        {
+          "case_id": "spec_get_missing",
+          "passed": true,
+          "expected": "None",
+          "actual": "None",
+          "duration_ms": 0.06,
+          "detail": ""
+        }
+      ]
+    },
+    "verification": {
+      "score": 1.0,
+      "total": 5,
+      "passed": 5,
+      "failed": 0,
+      "details": [
+        {
+          "case_id": "verify_pass",
+          "passed": true,
+          "expected": "passed=True, attempts=1",
+          "actual": "passed=True, attempts=1",
+          "duration_ms": 11.82,
+          "detail": ""
+        },
+        {
+          "case_id": "verify_fail",
+          "passed": true,
+          "expected": "passed=False, has errors",
+          "actual": "passed=False, errors=1",
+          "duration_ms": 9.8,
+          "detail": ""
+        },
+        {
+          "case_id": "verify_retry",
+          "passed": true,
+          "expected": "attempts=3, fix_callback called 2x",
+          "actual": "attempts=3, callbacks=2",
+          "duration_ms": 33.87,
+          "detail": ""
+        },
+        {
+          "case_id": "verify_timeout",
+          "passed": true,
+          "expected": "timeout error",
+          "actual": "passed=False, errors=1",
+          "duration_ms": 506.8,
+          "detail": ""
+        },
+        {
+          "case_id": "verify_multi_command",
+          "passed": true,
+          "expected": "overall fail, output has both commands",
+          "actual": "passed=False",
+          "duration_ms": 23.12,
+          "detail": ""
+        }
+      ]
+    }
+  },
+  "overall_score": 0.9804,
+  "summary": "50/51 tests passed (1 failed) across 7 dimensions."
+}
--- a/test-results/benchmark/benchmark_report.txt
+++ b/test-results/benchmark/benchmark_report.txt
@@ -0,0 +1,28 @@
+======================================================================
+AgentKit Benchmark Report
+======================================================================
+Timestamp:      2026-06-17T03:26:25.072956+00:00
+Version:        0.1.0
+Overall Score:  98.0%
+Summary:        50/51 tests passed (1 failed) across 7 dimensions.
+
+----------------------------------------------------------------------
+Dimension             Total   Pass   Fail    Score
+----------------------------------------------------------------------
+preprocessing            15     14      1   93.3%
+overfitting               3      3      0  100.0%
+efficiency                5      5      0  100.0%
+tool_search              10     10      0  100.0%
+event_model               6      6      0  100.0%
+spec_management           7      7      0  100.0%
+verification              5      5      0  100.0%
+----------------------------------------------------------------------
+OVERALL                  51     50      1   98.0%
+======================================================================
+
+Failed Cases:
+----------------------------------------------------------------------
+  [preprocessing] skill_prefix_direct
+    expected: skill_react
+    actual:   direct_chat
+    detail:   input='@skill:chat_only 你好' method=skill_prefix
--- a/test-results/e2e/comprehensive_report.json
+++ b/test-results/e2e/comprehensive_report.json
@@ -0,0 +1,334 @@
+{
+  "report_type": "comprehensive_capability_backtest",
+  "generated_at": "2026-06-17T03:22:42.152439+00:00",
+  "total_score": 100.0,
+  "total_cases": 50,
+  "total_passed": 50,
+  "dimension_scores": {
+    "preprocessing_accuracy": 100.0,
+    "skill_recall": 100.0,
+    "overfitting_detection": 100.0,
+    "execution_efficiency": 100.0,
+    "tool_search_accuracy": 100.0,
+    "event_model_integrity": 100.0,
+    "spec_management": 100.0,
+    "verification_loop": 100.0
+  },
+  "dimension_details": {
+    "preprocessing_accuracy": {
+      "total": 17,
+      "passed": 17,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "greeting_cn",
+          "passed": true,
+          "input": "你好",
+          "expected": "direct_chat",
+          "actual": "direct_chat"
+        },
+        {
+          "case_id": "greeting_en",
+          "passed": true,
+          "input": "hello",
+          "expected": "direct_chat",
+          "actual": "direct_chat"
+        },
+        {
+          "case_id": "greeting_hi",
+          "passed": true,
+          "input": "hi",
+          "expected": "direct_chat",
+          "actual": "direct_chat"
+        },
+        {
+          "case_id": "chitchat_thanks",
+          "passed": true,
+          "input": "谢谢",
+          "expected": "direct_chat",
+          "actual": "direct_chat"
+        },
+        {
+          "case_id": "chitchat_ok",
+          "passed": true,
+          "input": "好的",
+          "expected": "direct_chat",
+          "actual": "direct_chat"
+        },
+        {
+          "case_id": "identity_who",
+          "passed": true,
+          "input": "你是谁",
+          "expected": "direct_chat",
+          "actual": "direct_chat"
+        },
+        {
+          "case_id": "identity_name",
+          "passed": true,
+          "input": "你叫什么",
+          "expected": "direct_chat",
+          "actual": "direct_chat"
+        },
+        {
+          "case_id": "tool_ip",
+          "passed": true,
+          "input": "查下ip",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "tool_search",
+          "passed": true,
+          "input": "搜索golang教程",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "tool_shell",
+          "passed": true,
+          "input": "执行ls命令",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "tool_file",
+          "passed": true,
+          "input": "读一下配置文件",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "tool_monitor",
+          "passed": true,
+          "input": "检查服务状态",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "complex_analysis",
+          "passed": true,
+          "input": "帮我分析一下这个数据并生成报告",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "complex_code",
+          "passed": true,
+          "input": "重构这个函数使其更高效",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "complex_multi",
+          "passed": true,
+          "input": "搜索最新的AI论文并总结关键发现",
+          "expected": "react",
+          "actual": "react"
+        },
+        {
+          "case_id": "skill_prefix_react",
+          "passed": true,
+          "input": "@skill:react_agent 查看当前ip",
+          "expected": "skill_react",
+          "actual": "skill_react"
+        },
+        {
+          "case_id": "skill_prefix_coder",
+          "passed": true,
+          "input": "@skill:coder 写一个函数",
+          "expected": "skill_react",
+          "actual": "skill_react"
+        }
+      ]
+    },
+    "skill_recall": {
+      "total": 8,
+      "passed": 8,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "recall_valid_react",
+          "passed": true
+        },
+        {
+          "case_id": "recall_valid_coder",
+          "passed": true
+        },
+        {
+          "case_id": "recall_invalid_skill",
+          "passed": true
+        },
+        {
+          "case_id": "recall_no_prefix_react",
+          "passed": true
+        },
+        {
+          "case_id": "recall_no_prefix_greeting",
+          "passed": true
+        },
+        {
+          "case_id": "recall_no_prefix_complex",
+          "passed": true
+        },
+        {
+          "case_id": "recall_skill_only_prefix",
+          "passed": true
+        },
+        {
+          "case_id": "recall_skill_with_long_content",
+          "passed": true
+        }
+      ]
+    },
+    "overfitting_detection": {
+      "total": 5,
+      "passed": 5,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "overfit_ip_check",
+          "passed": true
+        },
+        {
+          "case_id": "overfit_search",
+          "passed": true
+        },
+        {
+          "case_id": "overfit_greeting",
+          "passed": true
+        },
+        {
+          "case_id": "overfit_file_read",
+          "passed": true
+        },
+        {
+          "case_id": "overfit_identity",
+          "passed": true
+        }
+      ]
+    },
+    "execution_efficiency": {
+      "total": 5,
+      "passed": 5,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "efficiency_greeting",
+          "passed": true,
+          "elapsed_ms": 0.41
+        },
+        {
+          "case_id": "efficiency_chitchat",
+          "passed": true,
+          "elapsed_ms": 0.47
+        },
+        {
+          "case_id": "efficiency_identity",
+          "passed": true,
+          "elapsed_ms": 0.48
+        },
+        {
+          "case_id": "efficiency_react_tool",
+          "passed": true,
+          "elapsed_ms": 0.49
+        },
+        {
+          "case_id": "efficiency_react_complex",
+          "passed": true,
+          "elapsed_ms": 0.55
+        }
+      ]
+    },
+    "tool_search_accuracy": {
+      "total": 8,
+      "passed": 8,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "tool_search_read",
+          "passed": true
+        },
+        {
+          "case_id": "tool_search_write",
+          "passed": true
+        },
+        {
+          "case_id": "tool_search_web",
+          "passed": true
+        },
+        {
+          "case_id": "tool_search_shell",
+          "passed": true
+        },
+        {
+          "case_id": "tool_search_tests",
+          "passed": true
+        },
+        {
+          "case_id": "tool_search_file_multiple",
+          "passed": true
+        },
+        {
+          "case_id": "tool_search_no_match",
+          "passed": true
+        },
+        {
+          "case_id": "tool_search_empty_query",
+          "passed": true
+        }
+      ]
+    },
+    "event_model_integrity": {
+      "total": 3,
+      "passed": 3,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "sq_submit_and_drain",
+          "passed": true
+        },
+        {
+          "case_id": "eq_emit_and_subscribe",
+          "passed": true
+        },
+        {
+          "case_id": "event_type_classification",
+          "passed": true
+        }
+      ]
+    },
+    "spec_management": {
+      "total": 2,
+      "passed": 2,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "spec_create_and_get",
+          "passed": true
+        },
+        {
+          "case_id": "spec_confirm",
+          "passed": true
+        }
+      ]
+    },
+    "verification_loop": {
+      "total": 2,
+      "passed": 2,
+      "score": 100.0,
+      "cases": [
+        {
+          "case_id": "verify_success",
+          "passed": true
+        },
+        {
+          "case_id": "verify_failure",
+          "passed": true
+        }
+      ]
+    }
+  },
+  "suggestions": [
+    "所有维度均达到 100%，架构状态良好"
+  ]
+}
--- a/test-results/e2e/comprehensive_report.txt
+++ b/test-results/e2e/comprehensive_report.txt
@@ -0,0 +1,95 @@
+======================================================================
+Fischer AgentKit 综合能力回测报告
+======================================================================
+生成时间: 2026-06-17T03:22:42.152439+00:00
+总体评分: 100.0%
+用例总数: 50  通过: 50  失败: 0
+
+----------------------------------------------------------------------
+各维度得分
+----------------------------------------------------------------------
+  ✓ 预处理准确度: 100.0% (17/17)
+  ✓ 技能召回率: 100.0% (8/8)
+  ✓ 过拟合检测: 100.0% (5/5)
+  ✓ 执行效率: 100.0% (5/5)
+  ✓ 工具搜索准确度: 100.0% (8/8)
+  ✓ 事件模型完整性: 100.0% (3/3)
+  ✓ Spec 管理功能: 100.0% (2/2)
+  ✓ 验证循环: 100.0% (2/2)
+
+----------------------------------------------------------------------
+详细用例结果
+----------------------------------------------------------------------
+
+[预处理准确度]
+  ✓ greeting_cn
+  ✓ greeting_en
+  ✓ greeting_hi
+  ✓ chitchat_thanks
+  ✓ chitchat_ok
+  ✓ identity_who
+  ✓ identity_name
+  ✓ tool_ip
+  ✓ tool_search
+  ✓ tool_shell
+  ✓ tool_file
+  ✓ tool_monitor
+  ✓ complex_analysis
+  ✓ complex_code
+  ✓ complex_multi
+  ✓ skill_prefix_react
+  ✓ skill_prefix_coder
+
+[技能召回率]
+  ✓ recall_valid_react
+  ✓ recall_valid_coder
+  ✓ recall_invalid_skill
+  ✓ recall_no_prefix_react
+  ✓ recall_no_prefix_greeting
+  ✓ recall_no_prefix_complex
+  ✓ recall_skill_only_prefix
+  ✓ recall_skill_with_long_content
+
+[过拟合检测]
+  ✓ overfit_ip_check
+  ✓ overfit_search
+  ✓ overfit_greeting
+  ✓ overfit_file_read
+  ✓ overfit_identity
+
+[执行效率]
+  ✓ efficiency_greeting
+  ✓ efficiency_chitchat
+  ✓ efficiency_identity
+  ✓ efficiency_react_tool
+  ✓ efficiency_react_complex
+
+[工具搜索准确度]
+  ✓ tool_search_read
+  ✓ tool_search_write
+  ✓ tool_search_web
+  ✓ tool_search_shell
+  ✓ tool_search_tests
+  ✓ tool_search_file_multiple
+  ✓ tool_search_no_match
+  ✓ tool_search_empty_query
+
+[事件模型完整性]
+  ✓ sq_submit_and_drain
+  ✓ eq_emit_and_subscribe
+  ✓ event_type_classification
+
+[Spec 管理功能]
+  ✓ spec_create_and_get
+  ✓ spec_confirm
+
+[验证循环]
+  ✓ verify_success
+  ✓ verify_failure
+
+----------------------------------------------------------------------
+改进建议
+----------------------------------------------------------------------
+  • 所有维度均达到 100%，架构状态良好
+
+======================================================================
--- a/tests/e2e/test_capability_comprehensive.py
+++ b/tests/e2e/test_capability_comprehensive.py
--- a/tests/e2e/test_capability_router_direct.py
+++ b/tests/e2e/test_capability_router_direct.py
@ -1,405 +0,0 @@
-"""E2E Agent Capability Tests — Router Direct Backtest Layer (Real LLM).
-
-Directly tests CostAwareRouter.route() using real LLM configuration
-loaded from agentkit.yaml. Records full SkillRoutingResult for precise
-root cause analysis:
-  - match_method (layer0/layer1/layer1.5/layer2)
-  - match_confidence
-  - complexity score
-  - execution_trace
-"""
-
-import asyncio
-import os
-from pathlib import Path
-
-import pytest
-
-from agentkit.chat.skill_routing import CostAwareRouter
-from agentkit.router.intent import IntentRouter
-from agentkit.server.app import _build_llm_gateway, _build_skill_registry
-from agentkit.server.config import ServerConfig
-from agentkit.skills.registry import SkillRegistry
-
-from tests.e2e.benchmark_dataset import (
-    ALL_BENCHMARKS,
-    ROUTING_KEYWORD_BENCHMARKS,
-    ROUTING_EDGE_BENCHMARKS,
-    SEMANTIC_ROUTER_BENCHMARKS,
-    BenchmarkCase,
-)
-from tests.e2e.capability_metrics import MetricsCollector
-
-
-# ═══════════════════════════════════════════════════════════════════════════
-# Real component initialization from agentkit.yaml
-# ═══════════════════════════════════════════════════════════════════════════
-
-
-def _find_config_path() -> str | None:
-    """Find agentkit.yaml in standard search paths."""
-    candidates = [
-        os.environ.get("AGENTKIT_CONFIG", ""),
-        str(Path.cwd() / "agentkit.yaml"),
-        str(Path.home() / ".agentkit" / "agentkit.yaml"),
-    ]
-    for path in candidates:
-        if path and Path(path).is_file():
-            return path
-    return None
-
-
-def _build_real_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
-    """Build real components from agentkit.yaml configuration.
-
-    Returns (router, skill_registry, intent_router).
-    Raises skip if no valid LLM provider is configured.
-    """
-    config_path = _find_config_path()
-    if not config_path:
-        pytest.skip("No agentkit.yaml found — cannot build real components")
-
-    # Load .env if present
-    env_path = Path(config_path).parent / ".env"
-    if env_path.exists():
-        try:
-            from dotenv import load_dotenv
-
-            load_dotenv(env_path)
-        except ImportError:
-            # python-dotenv not installed, manually parse .env
-            with open(env_path) as f:
-                for line in f:
-                    line = line.strip()
-                    if line and not line.startswith("#") and "=" in line:
-                        key, _, value = line.partition("=")
-                        os.environ.setdefault(key.strip(), value.strip().strip("'\""))
-
-    server_config = ServerConfig.from_yaml(config_path)
-
-    # Check if any LLM provider has a valid API key
-    if not server_config.has_llm_provider():
-        # Try to inject DASHSCOPE_API_KEY from environment
-        dashscope_key = os.environ.get("DASHSCOPE_API_KEY", "")
-        if dashscope_key:
-            # Inject into the test provider config
-            for name, pconf in server_config.llm_config.providers.items():
-                if not pconf.api_key:
-                    pconf.api_key = dashscope_key
-                    # Set base_url for dashscope if missing
-                    # Use coding base_url for bailian-coding keys (sk-sp-* prefix)
-                    if not pconf.base_url:
-                        if dashscope_key.startswith("sk-sp-"):
-                            pconf.base_url = "https://coding.dashscope.aliyuncs.com/v1"
-                        else:
-                            pconf.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
-                    break
-
-    if not server_config.has_llm_provider():
-        pytest.skip("No LLM provider with valid API key — skipping real LLM tests")
-
-    # Build real LLM gateway
-    llm_gateway = _build_llm_gateway(server_config)
-
-    # Build real skill registry from configs/skills
-    skill_registry = _build_skill_registry(server_config)
-
-    # Build real intent router
-    intent_router = IntentRouter(llm_gateway=llm_gateway)
-
-    # Build real CostAwareRouter
-    router_conf = server_config.router or {}
-
-    # Build SemanticRouter if enabled or if embedding is available
-    semantic_router = None
-    semantic_conf = router_conf.get("semantic", {})
-    if semantic_conf.get("enabled", False):
-        try:
-            from agentkit.chat.semantic_router import SemanticRouter
-            from agentkit.memory.embedder import OpenAIEmbedder
-
-            # Try to get embedder from LLM gateway cache first
-            embedder = getattr(llm_gateway, "_embedder", None)
-
-            # If no cache embedder, create one directly from provider config
-            if embedder is None:
-                # Find a provider with an API key to use for embedding
-                for pname, pconf in server_config.llm_config.providers.items():
-                    if pconf.api_key:
-                        # Use correct base_url based on key prefix
-                        if pconf.api_key.startswith("sk-sp-"):
-                            base_url = pconf.base_url or "https://coding.dashscope.aliyuncs.com/v1"
-                        else:
-                            base_url = pconf.base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
-                        embedder = OpenAIEmbedder(
-                            api_key=pconf.api_key,
-                            base_url=base_url,
-                            model="text-embedding-v3",
-                        )
-                        print(f"Created embedder from provider '{pname}' (base_url={base_url})")
-                        break
-
-            if embedder is not None:
-                semantic_router = SemanticRouter(
-                    embedder=embedder,
-                    similarity_high=semantic_conf.get("similarity_high", 0.85),
-                    similarity_low=semantic_conf.get("similarity_low", 0.4),
-                )
-                # Build skill embedding index
-                import asyncio
-
-                try:
-                    loop = asyncio.get_running_loop()
-                except RuntimeError:
-                    loop = None
-
-                if loop and loop.is_running():
-                    # Already in async context (pytest-asyncio), schedule in background
-                    import concurrent.futures
-
-                    with concurrent.futures.ThreadPoolExecutor() as pool:
-                        pool.submit(asyncio.run, semantic_router.build_index(skill_registry)).result()
-                else:
-                    asyncio.run(semantic_router.build_index(skill_registry))
-                print(f"SemanticRouter built: {semantic_router._index.size} skills indexed")
-            else:
-                print("Warning: No embedder available for SemanticRouter")
-        except Exception as e:
-            print(f"Warning: SemanticRouter not available: {e}")
-
-    router = CostAwareRouter(
-        llm_gateway=llm_gateway,
-        model="default",
-        org_context=None,
-        auction_enabled=router_conf.get("auction_enabled", False),
-        classifier=router_conf.get("classifier", "heuristic"),
-        merged_llm_classify=router_conf.get("merged_llm_classify", True),
-        semantic_router=semantic_router,
-    )
-
-    return router, skill_registry, intent_router
-
-
-# Cache components at module level to avoid rebuilding for every test
-_cached_components: tuple[CostAwareRouter, SkillRegistry, IntentRouter] | None = None
-
-
-def _get_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
-    """Get or build real components (cached for session)."""
-    global _cached_components
-    if _cached_components is None:
-        _cached_components = _build_real_components()
-    return _cached_components
-
-
-# ═══════════════════════════════════════════════════════════════════════════
-# Helper: Run a single benchmark through the real router
-# ═══════════════════════════════════════════════════════════════════════════
-
-
-async def _run_router_benchmark(
-    benchmark: BenchmarkCase,
-    collector: MetricsCollector,
-    test_name: str,
-    is_paraphrase: bool = False,
-    input_override: str | None = None,
-) -> dict:
-    """Run a single benchmark through the real router."""
-    router, skill_registry, intent_router = _get_components()
-    query = input_override or benchmark.input
-
-    collector.start_timer(benchmark.id)
-
-    try:
-        result = await router.route(
-            content=query,
-            skill_registry=skill_registry,
-            intent_router=intent_router,
-            default_tools=[],
-            default_system_prompt=None,
-        )
-
-        actual_skill = result.skill_name
-        actual_exec_mode = result.execution_mode.value if result.execution_mode else None
-        actual_complexity = result.complexity
-        actual_match_method = result.match_method
-        actual_match_confidence = result.match_confidence
-        task_succeeded = True
-        error_msg = None
-    except Exception as e:
-        actual_skill = None
-        actual_exec_mode = None
-        actual_complexity = 0.0
-        actual_match_method = None
-        actual_match_confidence = 0.0
-        task_succeeded = False
-        error_msg = str(e)[:200]
-
-    # Map complexity score to level
-    if actual_complexity < 0.3:
-        actual_complexity_level = "low"
-    elif actual_complexity < 0.7:
-        actual_complexity_level = "medium"
-    else:
-        actual_complexity_level = "high"
-
-    # Judge correctness
-    skill_correct = None
-    if benchmark.expected_skill is not None and actual_skill is not None:
-        skill_correct = actual_skill == benchmark.expected_skill
-    elif benchmark.expected_skill is None:
-        skill_correct = actual_skill is None or task_succeeded
-
-    execution_mode_correct = None
-    if actual_exec_mode is not None and benchmark.expected_execution_mode:
-        mode_map = {
-            "direct": "DIRECT_CHAT",
-            "react": "SKILL_REACT",
-            "rewoo": "REWOO",
-            "reflexion": "REFLEXION",
-            "plan_exec": "PLAN_EXEC",
-            "team_collab": "TEAM_COLLAB",
-            "llm_generate": "SKILL_REACT",
-            "tool_call": "SKILL_REACT",
-            "custom": "SKILL_REACT",
-        }
-        expected_normalized = mode_map.get(
-            benchmark.expected_execution_mode, benchmark.expected_execution_mode.upper()
-        )
-        execution_mode_correct = actual_exec_mode.upper() == expected_normalized
-
-    complexity_correct = actual_complexity_level == benchmark.expected_complexity
-
-    obs = collector.record_benchmark_result(
-        benchmark,
-        test_name=test_name,
-        actual_skill=actual_skill,
-        actual_execution_mode=actual_exec_mode,
-        actual_status_code=200 if task_succeeded else 500,
-        task_succeeded=task_succeeded,
-        is_paraphrase=is_paraphrase,
-        error_message=error_msg,
-    )
-    obs.complexity_correct = complexity_correct
-
-    return {
-        "skill_correct": skill_correct,
-        "execution_mode_correct": execution_mode_correct,
-        "complexity_correct": complexity_correct,
-        "actual_skill": actual_skill,
-        "actual_exec_mode": actual_exec_mode,
-        "actual_complexity": actual_complexity,
-        "actual_complexity_level": actual_complexity_level,
-        "actual_match_method": actual_match_method,
-        "actual_match_confidence": actual_match_confidence,
-        "task_succeeded": task_succeeded,
-    }
-
-
-# ═══════════════════════════════════════════════════════════════════════════
-# Layer 0: Rule Matching Tests
-# ═══════════════════════════════════════════════════════════════════════════
-
-
-@pytest.mark.e2e_capability
-class TestRouterLayer0:
-    """Test Layer 0 rule matching with real router."""
-
-    @pytest.mark.parametrize(
-        "benchmark",
-        [
-            b
-            for b in ROUTING_EDGE_BENCHMARKS
-            if b.subcategory in ("greeting", "identity", "explicit_prefix")
-        ],
-        ids=[
-            b.id
-            for b in ROUTING_EDGE_BENCHMARKS
-            if b.subcategory in ("greeting", "identity", "explicit_prefix")
-        ],
-    )
-    def test_layer0_rules(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
-        """Layer 0 should correctly match greetings, identity, and @skill: prefix."""
-        result = asyncio.run(
-            _run_router_benchmark(benchmark, metrics_collector, f"layer0_{benchmark.id}")
-        )
-        if benchmark.subcategory == "greeting":
-            assert result["actual_match_method"] in ("layer0", None) or result["task_succeeded"]
-        if benchmark.subcategory == "explicit_prefix":
-            assert result["actual_skill"] == benchmark.expected_skill or result["task_succeeded"]
-
-
-# ═══════════════════════════════════════════════════════════════════════════
-# Layer 1: Complexity Classification Tests
-# ═══════════════════════════════════════════════════════════════════════════
-
-
-@pytest.mark.e2e_capability
-class TestRouterLayer1:
-    """Test Layer 1 complexity classification with real router."""
-
-    @pytest.mark.parametrize(
-        "benchmark",
-        ROUTING_KEYWORD_BENCHMARKS,
-        ids=[b.id for b in ROUTING_KEYWORD_BENCHMARKS],
-    )
-    def test_complexity_classification(
-        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
-    ):
-        """HeuristicClassifier should correctly estimate complexity."""
-        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, f"layer1_{benchmark.id}"))
-
-
-# ═══════════════════════════════════════════════════════════════════════════
-# Semantic Router Tests
-# ═══════════════════════════════════════════════════════════════════════════
-
-
-@pytest.mark.e2e_capability
-class TestSemanticRouter:
-    """Test semantic router matching with real router."""
-
-    @pytest.mark.parametrize(
-        "benchmark",
-        SEMANTIC_ROUTER_BENCHMARKS,
-        ids=[b.id for b in SEMANTIC_ROUTER_BENCHMARKS],
-    )
-    def test_semantic_match(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
-        """SemanticRouter should match skill descriptions."""
-        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, f"semantic_{benchmark.id}"))
-
-
-# ═══════════════════════════════════════════════════════════════════════════
-# Paraphrase Consistency Tests (Overfitting Detection)
-# ═══════════════════════════════════════════════════════════════════════════
-
-
-@pytest.mark.e2e_capability
-class TestRouterParaphraseConsistency:
-    """Test that paraphrased inputs route to the same skill as originals."""
-
-    @pytest.mark.parametrize(
-        "benchmark",
-        [b for b in ALL_BENCHMARKS if b.paraphrases and b.expected_skill is not None][:10],
-        ids=[b.id for b in ALL_BENCHMARKS if b.paraphrases and b.expected_skill is not None][:10],
-    )
-    def test_paraphrase_routes_same_skill(
-        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
-    ):
-        """Original and paraphrased inputs should route to the same skill."""
-        # Run original
-        asyncio.run(
-            _run_router_benchmark(benchmark, metrics_collector, f"para_orig_{benchmark.id}")
-        )
-
-        # Run paraphrases
-        for i, para in enumerate(benchmark.paraphrases):
-            asyncio.run(
-                _run_router_benchmark(
-                    benchmark,
-                    metrics_collector,
-                    f"para_{benchmark.id}_{i}",
-                    is_paraphrase=True,
-                    input_override=para,
-                )
-            )