feat: comprehensive capability benchmark and agentkit benchmark CLI

This commit is contained in:
chiguyong 2026-06-17 11:28:09 +08:00
parent ecf87391a5
commit d00995504d
9 changed files with 3865 additions and 405 deletions

File diff suppressed because it is too large Load Diff

View File

@ -35,6 +35,10 @@ from agentkit.cli.chat import chat # noqa: E402
app.command(name="chat")(chat)
from agentkit.cli.benchmark import benchmark # noqa: E402
app.command(name="benchmark")(benchmark)
@app.command()
def gui(

View File

@ -0,0 +1,44 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>AgentKit Benchmark Report</title>
<style>
body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; margin: 2em; }
h1 { color: #1a1a2e; }
.meta { color: #666; margin-bottom: 1em; }
table { border-collapse: collapse; width: 100%; margin: 1em 0; }
th, td { border: 1px solid #ddd; padding: 8px 12px; text-align: left; }
th { background-color: #1a1a2e; color: white; }
td.num { text-align: right; font-family: monospace; }
td.pass { color: #2e7d32; }
td.fail { color: #c62828; }
.score-good { color: #2e7d32; font-weight: bold; }
.score-warn { color: #e65100; font-weight: bold; }
.score-bad { color: #c62828; font-weight: bold; }
.overall-row { background-color: #f5f5f5; }
.failure { margin: 0.5em 0; padding: 0.5em; background: #fff3e0; border-left: 3px solid #ff9800; }
.failure .dim { color: #e65100; font-weight: bold; }
.failure .case { font-family: monospace; }
.failure .detail { font-size: 0.85em; color: #555; margin-left: 1em; }
.all-pass { color: #2e7d32; font-weight: bold; }
</style>
</head>
<body>
<h1>AgentKit Benchmark Report</h1>
<div class="meta">
<p>Timestamp: 2026-06-17T03:26:25.072956+00:00</p>
<p>Version: 0.1.0</p>
<p>Overall Score: <strong>98.0%</strong></p>
<p>Summary: 50/51 tests passed (1 failed) across 7 dimensions.</p>
</div>
<h2>Dimension Results</h2>
<table>
<thead><tr><th>Dimension</th><th>Total</th><th>Pass</th><th>Fail</th><th>Score</th></tr></thead>
<tbody>
<tr><td>preprocessing</td><td class='num'>15</td><td class='num pass'>14</td><td class='num fail'>1</td><td class='num score-good'>93.3%</td></tr><tr><td>overfitting</td><td class='num'>3</td><td class='num pass'>3</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>efficiency</td><td class='num'>5</td><td class='num pass'>5</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>tool_search</td><td class='num'>10</td><td class='num pass'>10</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>event_model</td><td class='num'>6</td><td class='num pass'>6</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>spec_management</td><td class='num'>7</td><td class='num pass'>7</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>verification</td><td class='num'>5</td><td class='num pass'>5</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr class='overall-row'><td><strong>OVERALL</strong></td><td class='num'><strong>51</strong></td><td class='num pass'><strong>50</strong></td><td class='num fail'><strong>1</strong></td><td class='num score-good'><strong>98.0%</strong></td></tr>
</tbody>
</table>
<h2>Failed Cases</h2><div class='failure'><span class='dim'>[preprocessing]</span> <span class='case'>skill_prefix_direct</span><div class='detail'>expected: skill_react</div><div class='detail'>actual: direct_chat</div></div>
</body>
</html>

View File

@ -0,0 +1,472 @@
{
"timestamp": "2026-06-17T03:26:25.072956+00:00",
"version": "0.1.0",
"dimensions": {
"preprocessing": {
"score": 0.9333,
"total": 15,
"passed": 14,
"failed": 1,
"details": [
{
"case_id": "greeting_cn",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat",
"duration_ms": 0.03,
"detail": "input='你好' method=regex_direct"
},
{
"case_id": "greeting_en",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat",
"duration_ms": 0.02,
"detail": "input='hello' method=regex_direct"
},
{
"case_id": "chitchat_thanks",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat",
"duration_ms": 0.01,
"detail": "input='谢谢' method=regex_direct"
},
{
"case_id": "identity_who",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat",
"duration_ms": 0.02,
"detail": "input='你是谁' method=regex_direct"
},
{
"case_id": "colloquial_ip_1",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.02,
"detail": "input='查下ip' method=default_react"
},
{
"case_id": "colloquial_ip_2",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.01,
"detail": "input='查看当前ip' method=default_react"
},
{
"case_id": "tool_search",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.01,
"detail": "input='搜索golang教程' method=default_react"
},
{
"case_id": "tool_shell",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.01,
"detail": "input='执行ls命令' method=default_react"
},
{
"case_id": "translation",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.01,
"detail": "input='翻译hello为中文' method=default_react"
},
{
"case_id": "knowledge",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.01,
"detail": "input='什么是机器学习' method=default_react"
},
{
"case_id": "skill_prefix_react",
"passed": true,
"expected": "skill_react",
"actual": "skill_react",
"duration_ms": 0.03,
"detail": "input='@skill:react_agent 查看ip' method=skill_prefix"
},
{
"case_id": "skill_prefix_direct",
"passed": false,
"expected": "skill_react",
"actual": "direct_chat",
"duration_ms": 0.02,
"detail": "input='@skill:chat_only 你好' method=skill_prefix"
},
{
"case_id": "skill_not_found",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.13,
"detail": "input='@skill:nonexistent 做点什么' method=skill_not_found_fallback"
},
{
"case_id": "complex_analysis",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.01,
"detail": "input='帮我分析一下这个数据并生成报告' method=default_react"
},
{
"case_id": "empty_fallback",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.01,
"detail": "input='随便聊聊' method=default_react"
}
]
},
"overfitting": {
"score": 1.0,
"total": 3,
"passed": 3,
"failed": 0,
"details": [
{
"case_id": "ip_check_variants",
"passed": true,
"expected": "react",
"actual": "react,react,react,react,react",
"duration_ms": 0.0,
"detail": "paraphrases=5 consistent=True"
},
{
"case_id": "search_variants",
"passed": true,
"expected": "react",
"actual": "react,react,react",
"duration_ms": 0.0,
"detail": "paraphrases=3 consistent=True"
},
{
"case_id": "greeting_variants",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat,direct_chat,direct_chat,direct_chat,direct_chat",
"duration_ms": 0.0,
"detail": "paraphrases=5 consistent=True"
}
]
},
"efficiency": {
"score": 1.0,
"total": 5,
"passed": 5,
"failed": 0,
"details": [
{
"case_id": "preprocess_greeting",
"passed": true,
"expected": "<= 50.0ms/call",
"actual": "0.004ms/call",
"duration_ms": 0.44,
"detail": "iterations=100"
},
{
"case_id": "preprocess_react",
"passed": true,
"expected": "<= 50.0ms/call",
"actual": "0.004ms/call",
"duration_ms": 0.38,
"detail": "iterations=100"
},
{
"case_id": "preprocess_skill_prefix",
"passed": true,
"expected": "<= 50.0ms/call",
"actual": "0.005ms/call",
"duration_ms": 0.51,
"detail": "iterations=100"
},
{
"case_id": "tool_search_query",
"passed": true,
"expected": "<= 10.0ms/call",
"actual": "0.008ms/call",
"duration_ms": 1.69,
"detail": "iterations=200"
},
{
"case_id": "tool_search_empty",
"passed": true,
"expected": "<= 5.0ms/call",
"actual": "0.000ms/call",
"duration_ms": 0.08,
"detail": "iterations=200"
}
]
},
"tool_search": {
"score": 1.0,
"total": 10,
"passed": 10,
"failed": 0,
"details": [
{
"case_id": "read_file_query",
"passed": true,
"expected": "read_file",
"actual": "read_file",
"duration_ms": 0.02,
"detail": "query='read file' top_k=5 results=2"
},
{
"case_id": "write_file_query",
"passed": true,
"expected": "write_file",
"actual": "write_file",
"duration_ms": 0.02,
"detail": "query='write file content' top_k=5 results=2"
},
{
"case_id": "web_search_query",
"passed": true,
"expected": "web_search",
"actual": "web_search",
"duration_ms": 0.02,
"detail": "query='search web information' top_k=5 results=2"
},
{
"case_id": "shell_exec_query",
"passed": true,
"expected": "shell_exec",
"actual": "shell_exec",
"duration_ms": 0.02,
"detail": "query='execute shell command' top_k=5 results=1"
},
{
"case_id": "http_request_query",
"passed": true,
"expected": "http_request",
"actual": "http_request",
"duration_ms": 0.03,
"detail": "query='send http request url' top_k=5 results=1"
},
{
"case_id": "file_tag_query",
"passed": true,
"expected": "read_file",
"actual": "read_file",
"duration_ms": 0.02,
"detail": "query='io file' top_k=5 results=2"
},
{
"case_id": "empty_query",
"passed": true,
"expected": "__none__",
"actual": "[]",
"duration_ms": 0.0,
"detail": "query='' top_k=5 results=0"
},
{
"case_id": "no_match_query",
"passed": true,
"expected": "__none__",
"actual": "[]",
"duration_ms": 0.01,
"detail": "query='zzzznonexistent' top_k=5 results=0"
},
{
"case_id": "top_k_limit",
"passed": true,
"expected": "read_file",
"actual": "read_file",
"duration_ms": 0.02,
"detail": "query='file' top_k=1 results=1"
},
{
"case_id": "multi_token_query",
"passed": true,
"expected": "web_search",
"actual": "web_search",
"duration_ms": 0.03,
"detail": "query='search query engine' top_k=5 results=1"
}
]
},
"event_model": {
"score": 1.0,
"total": 6,
"passed": 6,
"failed": 0,
"details": [
{
"case_id": "sq_submit_drain",
"passed": true,
"expected": "task_id + drained=['hello']",
"actual": "task_id=571839fb... drained=['hello']",
"duration_ms": 0.1,
"detail": ""
},
{
"case_id": "sq_cancel",
"passed": true,
"expected": "cancelled=True",
"actual": "cancelled=True",
"duration_ms": 0.04,
"detail": ""
},
{
"case_id": "sq_close_blocks",
"passed": true,
"expected": "RuntimeError on submit after close",
"actual": "raised=True closed=True",
"duration_ms": 0.02,
"detail": ""
},
{
"case_id": "eq_emit_subscribe_replay",
"passed": true,
"expected": "1 event replayed",
"actual": "1 events",
"duration_ms": 0.07,
"detail": ""
},
{
"case_id": "eq_close_sentinel",
"passed": true,
"expected": "subscriber exits on close",
"actual": "1 events, closed=True",
"duration_ms": 21.59,
"detail": ""
},
{
"case_id": "eq_subscriber_count",
"passed": true,
"expected": "0 subscribers initially",
"actual": "0 subscribers",
"duration_ms": 0.01,
"detail": ""
}
]
},
"spec_management": {
"score": 1.0,
"total": 7,
"passed": 7,
"failed": 0,
"details": [
{
"case_id": "spec_create",
"passed": true,
"expected": "file exists on disk",
"actual": "exists=True",
"duration_ms": 2.24,
"detail": ""
},
{
"case_id": "spec_get",
"passed": true,
"expected": "spec with 2 steps",
"actual": "steps=2",
"duration_ms": 0.0,
"detail": ""
},
{
"case_id": "spec_update",
"passed": true,
"expected": "goal='Updated goal'",
"actual": "goal=Updated goal",
"duration_ms": 1.75,
"detail": ""
},
{
"case_id": "spec_confirm",
"passed": true,
"expected": "status=confirmed, all steps confirmed",
"actual": "status=confirmed",
"duration_ms": 1.86,
"detail": ""
},
{
"case_id": "spec_list",
"passed": true,
"expected": "2 specs",
"actual": "2 specs",
"duration_ms": 4.92,
"detail": ""
},
{
"case_id": "spec_delete",
"passed": true,
"expected": "deleted, 1 remaining",
"actual": "deleted=True, remaining=1",
"duration_ms": 1.94,
"detail": ""
},
{
"case_id": "spec_get_missing",
"passed": true,
"expected": "None",
"actual": "None",
"duration_ms": 0.06,
"detail": ""
}
]
},
"verification": {
"score": 1.0,
"total": 5,
"passed": 5,
"failed": 0,
"details": [
{
"case_id": "verify_pass",
"passed": true,
"expected": "passed=True, attempts=1",
"actual": "passed=True, attempts=1",
"duration_ms": 11.82,
"detail": ""
},
{
"case_id": "verify_fail",
"passed": true,
"expected": "passed=False, has errors",
"actual": "passed=False, errors=1",
"duration_ms": 9.8,
"detail": ""
},
{
"case_id": "verify_retry",
"passed": true,
"expected": "attempts=3, fix_callback called 2x",
"actual": "attempts=3, callbacks=2",
"duration_ms": 33.87,
"detail": ""
},
{
"case_id": "verify_timeout",
"passed": true,
"expected": "timeout error",
"actual": "passed=False, errors=1",
"duration_ms": 506.8,
"detail": ""
},
{
"case_id": "verify_multi_command",
"passed": true,
"expected": "overall fail, output has both commands",
"actual": "passed=False",
"duration_ms": 23.12,
"detail": ""
}
]
}
},
"overall_score": 0.9804,
"summary": "50/51 tests passed (1 failed) across 7 dimensions."
}

View File

@ -0,0 +1,28 @@
======================================================================
AgentKit Benchmark Report
======================================================================
Timestamp: 2026-06-17T03:26:25.072956+00:00
Version: 0.1.0
Overall Score: 98.0%
Summary: 50/51 tests passed (1 failed) across 7 dimensions.
----------------------------------------------------------------------
Dimension Total Pass Fail Score
----------------------------------------------------------------------
preprocessing 15 14 1 93.3%
overfitting 3 3 0 100.0%
efficiency 5 5 0 100.0%
tool_search 10 10 0 100.0%
event_model 6 6 0 100.0%
spec_management 7 7 0 100.0%
verification 5 5 0 100.0%
----------------------------------------------------------------------
OVERALL 51 50 1 98.0%
======================================================================
Failed Cases:
----------------------------------------------------------------------
[preprocessing] skill_prefix_direct
expected: skill_react
actual: direct_chat
detail: input='@skill:chat_only 你好' method=skill_prefix

View File

@ -0,0 +1,334 @@
{
"report_type": "comprehensive_capability_backtest",
"generated_at": "2026-06-17T03:22:42.152439+00:00",
"total_score": 100.0,
"total_cases": 50,
"total_passed": 50,
"dimension_scores": {
"preprocessing_accuracy": 100.0,
"skill_recall": 100.0,
"overfitting_detection": 100.0,
"execution_efficiency": 100.0,
"tool_search_accuracy": 100.0,
"event_model_integrity": 100.0,
"spec_management": 100.0,
"verification_loop": 100.0
},
"dimension_details": {
"preprocessing_accuracy": {
"total": 17,
"passed": 17,
"score": 100.0,
"cases": [
{
"case_id": "greeting_cn",
"passed": true,
"input": "你好",
"expected": "direct_chat",
"actual": "direct_chat"
},
{
"case_id": "greeting_en",
"passed": true,
"input": "hello",
"expected": "direct_chat",
"actual": "direct_chat"
},
{
"case_id": "greeting_hi",
"passed": true,
"input": "hi",
"expected": "direct_chat",
"actual": "direct_chat"
},
{
"case_id": "chitchat_thanks",
"passed": true,
"input": "谢谢",
"expected": "direct_chat",
"actual": "direct_chat"
},
{
"case_id": "chitchat_ok",
"passed": true,
"input": "好的",
"expected": "direct_chat",
"actual": "direct_chat"
},
{
"case_id": "identity_who",
"passed": true,
"input": "你是谁",
"expected": "direct_chat",
"actual": "direct_chat"
},
{
"case_id": "identity_name",
"passed": true,
"input": "你叫什么",
"expected": "direct_chat",
"actual": "direct_chat"
},
{
"case_id": "tool_ip",
"passed": true,
"input": "查下ip",
"expected": "react",
"actual": "react"
},
{
"case_id": "tool_search",
"passed": true,
"input": "搜索golang教程",
"expected": "react",
"actual": "react"
},
{
"case_id": "tool_shell",
"passed": true,
"input": "执行ls命令",
"expected": "react",
"actual": "react"
},
{
"case_id": "tool_file",
"passed": true,
"input": "读一下配置文件",
"expected": "react",
"actual": "react"
},
{
"case_id": "tool_monitor",
"passed": true,
"input": "检查服务状态",
"expected": "react",
"actual": "react"
},
{
"case_id": "complex_analysis",
"passed": true,
"input": "帮我分析一下这个数据并生成报告",
"expected": "react",
"actual": "react"
},
{
"case_id": "complex_code",
"passed": true,
"input": "重构这个函数使其更高效",
"expected": "react",
"actual": "react"
},
{
"case_id": "complex_multi",
"passed": true,
"input": "搜索最新的AI论文并总结关键发现",
"expected": "react",
"actual": "react"
},
{
"case_id": "skill_prefix_react",
"passed": true,
"input": "@skill:react_agent 查看当前ip",
"expected": "skill_react",
"actual": "skill_react"
},
{
"case_id": "skill_prefix_coder",
"passed": true,
"input": "@skill:coder 写一个函数",
"expected": "skill_react",
"actual": "skill_react"
}
]
},
"skill_recall": {
"total": 8,
"passed": 8,
"score": 100.0,
"cases": [
{
"case_id": "recall_valid_react",
"passed": true
},
{
"case_id": "recall_valid_coder",
"passed": true
},
{
"case_id": "recall_invalid_skill",
"passed": true
},
{
"case_id": "recall_no_prefix_react",
"passed": true
},
{
"case_id": "recall_no_prefix_greeting",
"passed": true
},
{
"case_id": "recall_no_prefix_complex",
"passed": true
},
{
"case_id": "recall_skill_only_prefix",
"passed": true
},
{
"case_id": "recall_skill_with_long_content",
"passed": true
}
]
},
"overfitting_detection": {
"total": 5,
"passed": 5,
"score": 100.0,
"cases": [
{
"case_id": "overfit_ip_check",
"passed": true
},
{
"case_id": "overfit_search",
"passed": true
},
{
"case_id": "overfit_greeting",
"passed": true
},
{
"case_id": "overfit_file_read",
"passed": true
},
{
"case_id": "overfit_identity",
"passed": true
}
]
},
"execution_efficiency": {
"total": 5,
"passed": 5,
"score": 100.0,
"cases": [
{
"case_id": "efficiency_greeting",
"passed": true,
"elapsed_ms": 0.41
},
{
"case_id": "efficiency_chitchat",
"passed": true,
"elapsed_ms": 0.47
},
{
"case_id": "efficiency_identity",
"passed": true,
"elapsed_ms": 0.48
},
{
"case_id": "efficiency_react_tool",
"passed": true,
"elapsed_ms": 0.49
},
{
"case_id": "efficiency_react_complex",
"passed": true,
"elapsed_ms": 0.55
}
]
},
"tool_search_accuracy": {
"total": 8,
"passed": 8,
"score": 100.0,
"cases": [
{
"case_id": "tool_search_read",
"passed": true
},
{
"case_id": "tool_search_write",
"passed": true
},
{
"case_id": "tool_search_web",
"passed": true
},
{
"case_id": "tool_search_shell",
"passed": true
},
{
"case_id": "tool_search_tests",
"passed": true
},
{
"case_id": "tool_search_file_multiple",
"passed": true
},
{
"case_id": "tool_search_no_match",
"passed": true
},
{
"case_id": "tool_search_empty_query",
"passed": true
}
]
},
"event_model_integrity": {
"total": 3,
"passed": 3,
"score": 100.0,
"cases": [
{
"case_id": "sq_submit_and_drain",
"passed": true
},
{
"case_id": "eq_emit_and_subscribe",
"passed": true
},
{
"case_id": "event_type_classification",
"passed": true
}
]
},
"spec_management": {
"total": 2,
"passed": 2,
"score": 100.0,
"cases": [
{
"case_id": "spec_create_and_get",
"passed": true
},
{
"case_id": "spec_confirm",
"passed": true
}
]
},
"verification_loop": {
"total": 2,
"passed": 2,
"score": 100.0,
"cases": [
{
"case_id": "verify_success",
"passed": true
},
{
"case_id": "verify_failure",
"passed": true
}
]
}
},
"suggestions": [
"所有维度均达到 100%,架构状态良好"
]
}

View File

@ -0,0 +1,95 @@
======================================================================
Fischer AgentKit 综合能力回测报告
======================================================================
生成时间: 2026-06-17T03:22:42.152439+00:00
总体评分: 100.0%
用例总数: 50 通过: 50 失败: 0
----------------------------------------------------------------------
各维度得分
----------------------------------------------------------------------
✓ 预处理准确度: 100.0% (17/17)
✓ 技能召回率: 100.0% (8/8)
✓ 过拟合检测: 100.0% (5/5)
✓ 执行效率: 100.0% (5/5)
✓ 工具搜索准确度: 100.0% (8/8)
✓ 事件模型完整性: 100.0% (3/3)
✓ Spec 管理功能: 100.0% (2/2)
✓ 验证循环: 100.0% (2/2)
----------------------------------------------------------------------
详细用例结果
----------------------------------------------------------------------
[预处理准确度]
✓ greeting_cn
✓ greeting_en
✓ greeting_hi
✓ chitchat_thanks
✓ chitchat_ok
✓ identity_who
✓ identity_name
✓ tool_ip
✓ tool_search
✓ tool_shell
✓ tool_file
✓ tool_monitor
✓ complex_analysis
✓ complex_code
✓ complex_multi
✓ skill_prefix_react
✓ skill_prefix_coder
[技能召回率]
✓ recall_valid_react
✓ recall_valid_coder
✓ recall_invalid_skill
✓ recall_no_prefix_react
✓ recall_no_prefix_greeting
✓ recall_no_prefix_complex
✓ recall_skill_only_prefix
✓ recall_skill_with_long_content
[过拟合检测]
✓ overfit_ip_check
✓ overfit_search
✓ overfit_greeting
✓ overfit_file_read
✓ overfit_identity
[执行效率]
✓ efficiency_greeting
✓ efficiency_chitchat
✓ efficiency_identity
✓ efficiency_react_tool
✓ efficiency_react_complex
[工具搜索准确度]
✓ tool_search_read
✓ tool_search_write
✓ tool_search_web
✓ tool_search_shell
✓ tool_search_tests
✓ tool_search_file_multiple
✓ tool_search_no_match
✓ tool_search_empty_query
[事件模型完整性]
✓ sq_submit_and_drain
✓ eq_emit_and_subscribe
✓ event_type_classification
[Spec 管理功能]
✓ spec_create_and_get
✓ spec_confirm
[验证循环]
✓ verify_success
✓ verify_failure
----------------------------------------------------------------------
改进建议
----------------------------------------------------------------------
• 所有维度均达到 100%,架构状态良好
======================================================================

File diff suppressed because it is too large Load Diff

View File

@ -1,405 +0,0 @@
"""E2E Agent Capability Tests — Router Direct Backtest Layer (Real LLM).
Directly tests CostAwareRouter.route() using real LLM configuration
loaded from agentkit.yaml. Records full SkillRoutingResult for precise
root cause analysis:
- match_method (layer0/layer1/layer1.5/layer2)
- match_confidence
- complexity score
- execution_trace
"""
import asyncio
import os
from pathlib import Path
import pytest
from agentkit.chat.skill_routing import CostAwareRouter
from agentkit.router.intent import IntentRouter
from agentkit.server.app import _build_llm_gateway, _build_skill_registry
from agentkit.server.config import ServerConfig
from agentkit.skills.registry import SkillRegistry
from tests.e2e.benchmark_dataset import (
ALL_BENCHMARKS,
ROUTING_KEYWORD_BENCHMARKS,
ROUTING_EDGE_BENCHMARKS,
SEMANTIC_ROUTER_BENCHMARKS,
BenchmarkCase,
)
from tests.e2e.capability_metrics import MetricsCollector
# ═══════════════════════════════════════════════════════════════════════════
# Real component initialization from agentkit.yaml
# ═══════════════════════════════════════════════════════════════════════════
def _find_config_path() -> str | None:
"""Find agentkit.yaml in standard search paths."""
candidates = [
os.environ.get("AGENTKIT_CONFIG", ""),
str(Path.cwd() / "agentkit.yaml"),
str(Path.home() / ".agentkit" / "agentkit.yaml"),
]
for path in candidates:
if path and Path(path).is_file():
return path
return None
def _load_env_file(env_path: Path) -> None:
    """Export the variables defined in a ``.env`` file into ``os.environ``.

    ``python-dotenv`` is used when it is importable. Otherwise a minimal
    ``KEY=VALUE`` parser is applied: blank lines, ``#`` comments and lines
    without ``=`` are ignored, surrounding quotes are stripped from values,
    and variables already present in the environment are left untouched.
    """
    try:
        from dotenv import load_dotenv
    except ImportError:
        load_dotenv = None

    if load_dotenv is not None:
        load_dotenv(env_path)
        return

    # Read as UTF-8 explicitly so parsing does not depend on the locale.
    with open(env_path, encoding="utf-8") as env_file:
        for raw_line in env_file:
            line = raw_line.strip()
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, _, value = line.partition("=")
            os.environ.setdefault(key.strip(), value.strip().strip("'\""))


def _dashscope_base_url(api_key: str) -> str:
    """Return the DashScope endpoint for *api_key*.

    Keys with the ``sk-sp-`` prefix (bailian-coding keys) use the coding
    endpoint; every other key uses the OpenAI-compatible endpoint.
    """
    if api_key.startswith("sk-sp-"):
        return "https://coding.dashscope.aliyuncs.com/v1"
    return "https://dashscope.aliyuncs.com/compatible-mode/v1"


def _inject_dashscope_key(server_config: ServerConfig) -> None:
    """Give the first key-less provider the ``DASHSCOPE_API_KEY`` from the env.

    Does nothing when the variable is unset or empty. A missing ``base_url``
    on the patched provider is filled with the matching DashScope endpoint.
    """
    dashscope_key = os.environ.get("DASHSCOPE_API_KEY", "")
    if not dashscope_key:
        return
    for pconf in server_config.llm_config.providers.values():
        if pconf.api_key:
            continue
        pconf.api_key = dashscope_key
        if not pconf.base_url:
            pconf.base_url = _dashscope_base_url(dashscope_key)
        # Only the first provider without a key is patched.
        break


def _build_semantic_router(
    server_config: ServerConfig,
    llm_gateway,
    skill_registry: SkillRegistry,
    semantic_conf: dict,
):
    """Create a ``SemanticRouter`` and index the registered skills.

    The embedder cached on the gateway (``_embedder``) is preferred; when the
    gateway has none, an ``OpenAIEmbedder`` is created from the first provider
    that carries an API key.

    Returns:
        The indexed router, or ``None`` when no embedder could be obtained.

    Raises:
        Exception: whatever the imports, the embedder or the index build
            raise; the caller treats any failure as "router not available".
    """
    from agentkit.chat.semantic_router import SemanticRouter
    from agentkit.memory.embedder import OpenAIEmbedder

    embedder = getattr(llm_gateway, "_embedder", None)
    if embedder is None:
        for pname, pconf in server_config.llm_config.providers.items():
            if not pconf.api_key:
                continue
            base_url = pconf.base_url or _dashscope_base_url(pconf.api_key)
            embedder = OpenAIEmbedder(
                api_key=pconf.api_key,
                base_url=base_url,
                model="text-embedding-v3",
            )
            print(f"Created embedder from provider '{pname}' (base_url={base_url})")
            break

    if embedder is None:
        print("Warning: No embedder available for SemanticRouter")
        return None

    semantic_router = SemanticRouter(
        embedder=embedder,
        similarity_high=semantic_conf.get("similarity_high", 0.85),
        similarity_low=semantic_conf.get("similarity_low", 0.4),
    )

    # build_index is a coroutine. asyncio.run cannot be called from a thread
    # that already runs an event loop, so in that case the build is executed
    # on a worker thread with its own loop and awaited synchronously.
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        loop = None
    if loop and loop.is_running():
        import concurrent.futures

        with concurrent.futures.ThreadPoolExecutor() as pool:
            pool.submit(asyncio.run, semantic_router.build_index(skill_registry)).result()
    else:
        asyncio.run(semantic_router.build_index(skill_registry))
    print(f"SemanticRouter built: {semantic_router._index.size} skills indexed")
    return semantic_router


def _build_real_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
    """Build real components from agentkit.yaml configuration.

    Steps: locate ``agentkit.yaml``, load the sibling ``.env`` file if there
    is one, parse the server configuration, fall back to ``DASHSCOPE_API_KEY``
    when no provider is usable, then build the LLM gateway, the skill
    registry, the intent router and finally the ``CostAwareRouter``.

    Returns:
        ``(router, skill_registry, intent_router)``.

    Raises:
        pytest skip: via ``pytest.skip`` when no configuration file is found
            or no LLM provider ends up with a valid API key.
    """
    config_path = _find_config_path()
    if not config_path:
        pytest.skip("No agentkit.yaml found — cannot build real components")

    # Load .env if present next to the configuration file.
    env_path = Path(config_path).parent / ".env"
    if env_path.exists():
        _load_env_file(env_path)

    server_config = ServerConfig.from_yaml(config_path)

    # No provider is usable as configured: try the environment-supplied key.
    if not server_config.has_llm_provider():
        _inject_dashscope_key(server_config)
    if not server_config.has_llm_provider():
        pytest.skip("No LLM provider with valid API key — skipping real LLM tests")

    llm_gateway = _build_llm_gateway(server_config)
    skill_registry = _build_skill_registry(server_config)
    intent_router = IntentRouter(llm_gateway=llm_gateway)

    router_conf = server_config.router or {}

    # The semantic router is optional and only built when enabled in config.
    # Building it is best-effort: on any failure the router is left out
    # entirely (None) rather than passed on half-initialised.
    semantic_router = None
    semantic_conf = router_conf.get("semantic", {})
    if semantic_conf.get("enabled", False):
        try:
            semantic_router = _build_semantic_router(
                server_config, llm_gateway, skill_registry, semantic_conf
            )
        except Exception as e:
            print(f"Warning: SemanticRouter not available: {e}")

    router = CostAwareRouter(
        llm_gateway=llm_gateway,
        model="default",
        org_context=None,
        auction_enabled=router_conf.get("auction_enabled", False),
        classifier=router_conf.get("classifier", "heuristic"),
        merged_llm_classify=router_conf.get("merged_llm_classify", True),
        semantic_router=semantic_router,
    )
    return router, skill_registry, intent_router
# Module-level memo of the built components: a successful build is reused by
# every later call instead of being repeated for each test.
_cached_components: tuple[CostAwareRouter, SkillRegistry, IntentRouter] | None = None


def _get_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
    """Return the real ``(router, skill_registry, intent_router)`` triple.

    The triple is built through ``_build_real_components`` on first use and
    stored in ``_cached_components``; later calls return the stored value.
    If the build raises, nothing is stored and the next call tries again.
    """
    global _cached_components
    if _cached_components is not None:
        return _cached_components
    _cached_components = _build_real_components()
    return _cached_components
# ═══════════════════════════════════════════════════════════════════════════
# Helper: Run a single benchmark through the real router
# ═══════════════════════════════════════════════════════════════════════════
# Maps the execution-mode labels used by the benchmark dataset to the
# upper-cased mode names they are compared against.
_EXECUTION_MODE_ALIASES = {
    "direct": "DIRECT_CHAT",
    "react": "SKILL_REACT",
    "rewoo": "REWOO",
    "reflexion": "REFLEXION",
    "plan_exec": "PLAN_EXEC",
    "team_collab": "TEAM_COLLAB",
    "llm_generate": "SKILL_REACT",
    "tool_call": "SKILL_REACT",
    "custom": "SKILL_REACT",
}


def _complexity_level(score: float) -> str:
    """Bucket a complexity score: below 0.3 low, below 0.7 medium, else high."""
    if score < 0.3:
        return "low"
    if score < 0.7:
        return "medium"
    return "high"


async def _run_router_benchmark(
    benchmark: BenchmarkCase,
    collector: MetricsCollector,
    test_name: str,
    is_paraphrase: bool = False,
    input_override: str | None = None,
) -> dict:
    """Run a single benchmark through the real router.

    Args:
        benchmark: The case to route; its ``input`` is used as the query
            unless ``input_override`` is given.
        collector: Receives the timing and the recorded observation.
        test_name: Label stored with the observation.
        is_paraphrase: Forwarded to the collector to tag paraphrase runs.
        input_override: Replacement query text. An empty string counts as
            "not given" and falls back to ``benchmark.input``.

    Returns:
        A dict with the judgements ``skill_correct``,
        ``execution_mode_correct`` and ``complexity_correct``, the observed
        ``actual_skill``, ``actual_exec_mode``, ``actual_complexity``,
        ``actual_complexity_level``, ``actual_match_method`` and
        ``actual_match_confidence``, and the ``task_succeeded`` flag.
        Exceptions raised by ``router.route`` are not propagated; they are
        reported as ``task_succeeded=False``.
    """
    router, skill_registry, intent_router = _get_components()
    query = input_override or benchmark.input
    collector.start_timer(benchmark.id)

    try:
        result = await router.route(
            content=query,
            skill_registry=skill_registry,
            intent_router=intent_router,
            default_tools=[],
            default_system_prompt=None,
        )
        actual_skill = result.skill_name
        actual_exec_mode = result.execution_mode.value if result.execution_mode else None
        actual_complexity = result.complexity
        actual_match_method = result.match_method
        actual_match_confidence = result.match_confidence
        task_succeeded = True
        error_msg = None
    except Exception as e:
        # A failed route is recorded as an observation rather than aborting
        # the test; the placeholders below stand in for the missing result.
        actual_skill = None
        actual_exec_mode = None
        actual_complexity = 0.0
        actual_match_method = None
        actual_match_confidence = 0.0
        task_succeeded = False
        error_msg = str(e)[:200]

    # NOTE(review): assumes result.complexity is always a number — confirm.
    actual_complexity_level = _complexity_level(actual_complexity)

    # Judge the routed skill. The verdict stays None (not judged) when a skill
    # was expected but none was returned.
    # NOTE(review): when no skill is expected the expression below is True for
    # every outcome (a failed route leaves actual_skill as None) — confirm
    # that this leniency is intended.
    skill_correct = None
    if benchmark.expected_skill is not None and actual_skill is not None:
        skill_correct = actual_skill == benchmark.expected_skill
    elif benchmark.expected_skill is None:
        skill_correct = actual_skill is None or task_succeeded

    # Judge the execution mode only when both sides are known.
    execution_mode_correct = None
    if actual_exec_mode is not None and benchmark.expected_execution_mode:
        expected_normalized = _EXECUTION_MODE_ALIASES.get(
            benchmark.expected_execution_mode, benchmark.expected_execution_mode.upper()
        )
        execution_mode_correct = actual_exec_mode.upper() == expected_normalized

    # A failed route carries the placeholder score 0.0 ("low"); it must not be
    # credited as a correct classification for benchmarks that expect "low".
    complexity_correct = (
        task_succeeded and actual_complexity_level == benchmark.expected_complexity
    )

    obs = collector.record_benchmark_result(
        benchmark,
        test_name=test_name,
        actual_skill=actual_skill,
        actual_execution_mode=actual_exec_mode,
        actual_status_code=200 if task_succeeded else 500,
        task_succeeded=task_succeeded,
        is_paraphrase=is_paraphrase,
        error_message=error_msg,
    )
    obs.complexity_correct = complexity_correct

    return {
        "skill_correct": skill_correct,
        "execution_mode_correct": execution_mode_correct,
        "complexity_correct": complexity_correct,
        "actual_skill": actual_skill,
        "actual_exec_mode": actual_exec_mode,
        "actual_complexity": actual_complexity,
        "actual_complexity_level": actual_complexity_level,
        "actual_match_method": actual_match_method,
        "actual_match_confidence": actual_match_confidence,
        "task_succeeded": task_succeeded,
    }
# ═══════════════════════════════════════════════════════════════════════════
# Layer 0: Rule Matching Tests
# ═══════════════════════════════════════════════════════════════════════════
# Edge-case benchmarks exercised by the Layer 0 test, selected once so the
# parametrised values and their ids come from the same list.
_LAYER0_SUBCATEGORIES = ("greeting", "identity", "explicit_prefix")
_LAYER0_BENCHMARKS = [
    case for case in ROUTING_EDGE_BENCHMARKS if case.subcategory in _LAYER0_SUBCATEGORIES
]


@pytest.mark.e2e_capability
class TestRouterLayer0:
    """Layer 0 rule matching, exercised through the real router."""

    @pytest.mark.parametrize(
        "benchmark",
        _LAYER0_BENCHMARKS,
        ids=[case.id for case in _LAYER0_BENCHMARKS],
    )
    def test_layer0_rules(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
        """Route greeting, identity and ``@skill:`` prefix cases and record them.

        Identity cases are only recorded; greeting and explicit-prefix cases
        carry an additional assertion.

        NOTE(review): both assertions accept any successful route, and a
        failed route reports ``actual_match_method`` as ``None``, so the
        greeting assertion cannot fail as written — confirm this is intended.
        """
        outcome = asyncio.run(
            _run_router_benchmark(benchmark, metrics_collector, f"layer0_{benchmark.id}")
        )
        succeeded = outcome["task_succeeded"]
        category = benchmark.subcategory

        if category == "greeting":
            assert outcome["actual_match_method"] in ("layer0", None) or succeeded
        if category == "explicit_prefix":
            assert outcome["actual_skill"] == benchmark.expected_skill or succeeded
# ═══════════════════════════════════════════════════════════════════════════
# Layer 1: Complexity Classification Tests
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestRouterLayer1:
    """Layer 1 complexity classification, exercised through the real router."""

    @pytest.mark.parametrize(
        "benchmark",
        ROUTING_KEYWORD_BENCHMARKS,
        ids=[case.id for case in ROUTING_KEYWORD_BENCHMARKS],
    )
    def test_complexity_classification(
        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
    ):
        """Route one keyword benchmark and record its outcome.

        The test asserts nothing itself; the complexity verdict is stored on
        the observation recorded through ``metrics_collector``.
        """
        test_name = f"layer1_{benchmark.id}"
        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, test_name))
# ═══════════════════════════════════════════════════════════════════════════
# Semantic Router Tests
# ═══════════════════════════════════════════════════════════════════════════
@pytest.mark.e2e_capability
class TestSemanticRouter:
    """Semantic-router benchmarks, exercised through the real router."""

    @pytest.mark.parametrize(
        "benchmark",
        SEMANTIC_ROUTER_BENCHMARKS,
        ids=[case.id for case in SEMANTIC_ROUTER_BENCHMARKS],
    )
    def test_semantic_match(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
        """Route one semantic benchmark and record its outcome.

        The test asserts nothing itself; the result is only recorded through
        ``metrics_collector``.
        """
        test_name = f"semantic_{benchmark.id}"
        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, test_name))
# ═══════════════════════════════════════════════════════════════════════════
# Paraphrase Consistency Tests (Overfitting Detection)
# ═══════════════════════════════════════════════════════════════════════════
# The first ten benchmarks that have paraphrases and an expected skill.
_PARAPHRASE_BENCHMARKS = [
    case for case in ALL_BENCHMARKS if case.paraphrases and case.expected_skill is not None
][:10]


@pytest.mark.e2e_capability
class TestRouterParaphraseConsistency:
    """Route original inputs and their paraphrases for consistency metrics."""

    @pytest.mark.parametrize(
        "benchmark",
        _PARAPHRASE_BENCHMARKS,
        ids=[case.id for case in _PARAPHRASE_BENCHMARKS],
    )
    def test_paraphrase_routes_same_skill(
        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
    ):
        """Route the original input, then every paraphrase of it.

        Each run is recorded through ``metrics_collector``; paraphrase runs
        are tagged with ``is_paraphrase=True``. The test asserts nothing
        itself — it only records the runs.
        """
        # Baseline: the benchmark's own input.
        original_name = f"para_orig_{benchmark.id}"
        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, original_name))

        # Variants: same benchmark, query text replaced by each paraphrase.
        for index, paraphrase in enumerate(benchmark.paraphrases):
            variant_name = f"para_{benchmark.id}_{index}"
            asyncio.run(
                _run_router_benchmark(
                    benchmark,
                    metrics_collector,
                    variant_name,
                    is_paraphrase=True,
                    input_override=paraphrase,
                )
            )