feat: comprehensive capability benchmark and agentkit benchmark CLI
This commit is contained in:
parent
ecf87391a5
commit
d00995504d
File diff suppressed because it is too large
Load Diff
|
|
@ -35,6 +35,10 @@ from agentkit.cli.chat import chat # noqa: E402
|
|||
|
||||
app.command(name="chat")(chat)
|
||||
|
||||
from agentkit.cli.benchmark import benchmark # noqa: E402
|
||||
|
||||
app.command(name="benchmark")(benchmark)
|
||||
|
||||
|
||||
@app.command()
|
||||
def gui(
|
||||
|
|
|
|||
|
|
@ -0,0 +1,44 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>AgentKit Benchmark Report</title>
|
||||
<style>
|
||||
body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; margin: 2em; }
|
||||
h1 { color: #1a1a2e; }
|
||||
.meta { color: #666; margin-bottom: 1em; }
|
||||
table { border-collapse: collapse; width: 100%; margin: 1em 0; }
|
||||
th, td { border: 1px solid #ddd; padding: 8px 12px; text-align: left; }
|
||||
th { background-color: #1a1a2e; color: white; }
|
||||
td.num { text-align: right; font-family: monospace; }
|
||||
td.pass { color: #2e7d32; }
|
||||
td.fail { color: #c62828; }
|
||||
.score-good { color: #2e7d32; font-weight: bold; }
|
||||
.score-warn { color: #e65100; font-weight: bold; }
|
||||
.score-bad { color: #c62828; font-weight: bold; }
|
||||
.overall-row { background-color: #f5f5f5; }
|
||||
.failure { margin: 0.5em 0; padding: 0.5em; background: #fff3e0; border-left: 3px solid #ff9800; }
|
||||
.failure .dim { color: #e65100; font-weight: bold; }
|
||||
.failure .case { font-family: monospace; }
|
||||
.failure .detail { font-size: 0.85em; color: #555; margin-left: 1em; }
|
||||
.all-pass { color: #2e7d32; font-weight: bold; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>AgentKit Benchmark Report</h1>
|
||||
<div class="meta">
|
||||
<p>Timestamp: 2026-06-17T03:26:25.072956+00:00</p>
|
||||
<p>Version: 0.1.0</p>
|
||||
<p>Overall Score: <strong>98.0%</strong></p>
|
||||
<p>Summary: 50/51 tests passed (1 failed) across 7 dimensions.</p>
|
||||
</div>
|
||||
<h2>Dimension Results</h2>
|
||||
<table>
|
||||
<thead><tr><th>Dimension</th><th>Total</th><th>Pass</th><th>Fail</th><th>Score</th></tr></thead>
|
||||
<tbody>
|
||||
<tr><td>preprocessing</td><td class='num'>15</td><td class='num pass'>14</td><td class='num fail'>1</td><td class='num score-good'>93.3%</td></tr><tr><td>overfitting</td><td class='num'>3</td><td class='num pass'>3</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>efficiency</td><td class='num'>5</td><td class='num pass'>5</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>tool_search</td><td class='num'>10</td><td class='num pass'>10</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>event_model</td><td class='num'>6</td><td class='num pass'>6</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>spec_management</td><td class='num'>7</td><td class='num pass'>7</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr><td>verification</td><td class='num'>5</td><td class='num pass'>5</td><td class='num fail'>0</td><td class='num score-good'>100.0%</td></tr><tr class='overall-row'><td><strong>OVERALL</strong></td><td class='num'><strong>51</strong></td><td class='num pass'><strong>50</strong></td><td class='num fail'><strong>1</strong></td><td class='num score-good'><strong>98.0%</strong></td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<h2>Failed Cases</h2><div class='failure'><span class='dim'>[preprocessing]</span> <span class='case'>skill_prefix_direct</span><div class='detail'>expected: skill_react</div><div class='detail'>actual: direct_chat</div></div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -0,0 +1,472 @@
|
|||
{
|
||||
"timestamp": "2026-06-17T03:26:25.072956+00:00",
|
||||
"version": "0.1.0",
|
||||
"dimensions": {
|
||||
"preprocessing": {
|
||||
"score": 0.9333,
|
||||
"total": 15,
|
||||
"passed": 14,
|
||||
"failed": 1,
|
||||
"details": [
|
||||
{
|
||||
"case_id": "greeting_cn",
|
||||
"passed": true,
|
||||
"expected": "direct_chat",
|
||||
"actual": "direct_chat",
|
||||
"duration_ms": 0.03,
|
||||
"detail": "input='你好' method=regex_direct"
|
||||
},
|
||||
{
|
||||
"case_id": "greeting_en",
|
||||
"passed": true,
|
||||
"expected": "direct_chat",
|
||||
"actual": "direct_chat",
|
||||
"duration_ms": 0.02,
|
||||
"detail": "input='hello' method=regex_direct"
|
||||
},
|
||||
{
|
||||
"case_id": "chitchat_thanks",
|
||||
"passed": true,
|
||||
"expected": "direct_chat",
|
||||
"actual": "direct_chat",
|
||||
"duration_ms": 0.01,
|
||||
"detail": "input='谢谢' method=regex_direct"
|
||||
},
|
||||
{
|
||||
"case_id": "identity_who",
|
||||
"passed": true,
|
||||
"expected": "direct_chat",
|
||||
"actual": "direct_chat",
|
||||
"duration_ms": 0.02,
|
||||
"detail": "input='你是谁' method=regex_direct"
|
||||
},
|
||||
{
|
||||
"case_id": "colloquial_ip_1",
|
||||
"passed": true,
|
||||
"expected": "react",
|
||||
"actual": "react",
|
||||
"duration_ms": 0.02,
|
||||
"detail": "input='查下ip' method=default_react"
|
||||
},
|
||||
{
|
||||
"case_id": "colloquial_ip_2",
|
||||
"passed": true,
|
||||
"expected": "react",
|
||||
"actual": "react",
|
||||
"duration_ms": 0.01,
|
||||
"detail": "input='查看当前ip' method=default_react"
|
||||
},
|
||||
{
|
||||
"case_id": "tool_search",
|
||||
"passed": true,
|
||||
"expected": "react",
|
||||
"actual": "react",
|
||||
"duration_ms": 0.01,
|
||||
"detail": "input='搜索golang教程' method=default_react"
|
||||
},
|
||||
{
|
||||
"case_id": "tool_shell",
|
||||
"passed": true,
|
||||
"expected": "react",
|
||||
"actual": "react",
|
||||
"duration_ms": 0.01,
|
||||
"detail": "input='执行ls命令' method=default_react"
|
||||
},
|
||||
{
|
||||
"case_id": "translation",
|
||||
"passed": true,
|
||||
"expected": "react",
|
||||
"actual": "react",
|
||||
"duration_ms": 0.01,
|
||||
"detail": "input='翻译hello为中文' method=default_react"
|
||||
},
|
||||
{
|
||||
"case_id": "knowledge",
|
||||
"passed": true,
|
||||
"expected": "react",
|
||||
"actual": "react",
|
||||
"duration_ms": 0.01,
|
||||
"detail": "input='什么是机器学习' method=default_react"
|
||||
},
|
||||
{
|
||||
"case_id": "skill_prefix_react",
|
||||
"passed": true,
|
||||
"expected": "skill_react",
|
||||
"actual": "skill_react",
|
||||
"duration_ms": 0.03,
|
||||
"detail": "input='@skill:react_agent 查看ip' method=skill_prefix"
|
||||
},
|
||||
{
|
||||
"case_id": "skill_prefix_direct",
|
||||
"passed": false,
|
||||
"expected": "skill_react",
|
||||
"actual": "direct_chat",
|
||||
"duration_ms": 0.02,
|
||||
"detail": "input='@skill:chat_only 你好' method=skill_prefix"
|
||||
},
|
||||
{
|
||||
"case_id": "skill_not_found",
|
||||
"passed": true,
|
||||
"expected": "react",
|
||||
"actual": "react",
|
||||
"duration_ms": 0.13,
|
||||
"detail": "input='@skill:nonexistent 做点什么' method=skill_not_found_fallback"
|
||||
},
|
||||
{
|
||||
"case_id": "complex_analysis",
|
||||
"passed": true,
|
||||
"expected": "react",
|
||||
"actual": "react",
|
||||
"duration_ms": 0.01,
|
||||
"detail": "input='帮我分析一下这个数据并生成报告' method=default_react"
|
||||
},
|
||||
{
|
||||
"case_id": "empty_fallback",
|
||||
"passed": true,
|
||||
"expected": "react",
|
||||
"actual": "react",
|
||||
"duration_ms": 0.01,
|
||||
"detail": "input='随便聊聊' method=default_react"
|
||||
}
|
||||
]
|
||||
},
|
||||
"overfitting": {
|
||||
"score": 1.0,
|
||||
"total": 3,
|
||||
"passed": 3,
|
||||
"failed": 0,
|
||||
"details": [
|
||||
{
|
||||
"case_id": "ip_check_variants",
|
||||
"passed": true,
|
||||
"expected": "react",
|
||||
"actual": "react,react,react,react,react",
|
||||
"duration_ms": 0.0,
|
||||
"detail": "paraphrases=5 consistent=True"
|
||||
},
|
||||
{
|
||||
"case_id": "search_variants",
|
||||
"passed": true,
|
||||
"expected": "react",
|
||||
"actual": "react,react,react",
|
||||
"duration_ms": 0.0,
|
||||
"detail": "paraphrases=3 consistent=True"
|
||||
},
|
||||
{
|
||||
"case_id": "greeting_variants",
|
||||
"passed": true,
|
||||
"expected": "direct_chat",
|
||||
"actual": "direct_chat,direct_chat,direct_chat,direct_chat,direct_chat",
|
||||
"duration_ms": 0.0,
|
||||
"detail": "paraphrases=5 consistent=True"
|
||||
}
|
||||
]
|
||||
},
|
||||
"efficiency": {
|
||||
"score": 1.0,
|
||||
"total": 5,
|
||||
"passed": 5,
|
||||
"failed": 0,
|
||||
"details": [
|
||||
{
|
||||
"case_id": "preprocess_greeting",
|
||||
"passed": true,
|
||||
"expected": "<= 50.0ms/call",
|
||||
"actual": "0.004ms/call",
|
||||
"duration_ms": 0.44,
|
||||
"detail": "iterations=100"
|
||||
},
|
||||
{
|
||||
"case_id": "preprocess_react",
|
||||
"passed": true,
|
||||
"expected": "<= 50.0ms/call",
|
||||
"actual": "0.004ms/call",
|
||||
"duration_ms": 0.38,
|
||||
"detail": "iterations=100"
|
||||
},
|
||||
{
|
||||
"case_id": "preprocess_skill_prefix",
|
||||
"passed": true,
|
||||
"expected": "<= 50.0ms/call",
|
||||
"actual": "0.005ms/call",
|
||||
"duration_ms": 0.51,
|
||||
"detail": "iterations=100"
|
||||
},
|
||||
{
|
||||
"case_id": "tool_search_query",
|
||||
"passed": true,
|
||||
"expected": "<= 10.0ms/call",
|
||||
"actual": "0.008ms/call",
|
||||
"duration_ms": 1.69,
|
||||
"detail": "iterations=200"
|
||||
},
|
||||
{
|
||||
"case_id": "tool_search_empty",
|
||||
"passed": true,
|
||||
"expected": "<= 5.0ms/call",
|
||||
"actual": "0.000ms/call",
|
||||
"duration_ms": 0.08,
|
||||
"detail": "iterations=200"
|
||||
}
|
||||
]
|
||||
},
|
||||
"tool_search": {
|
||||
"score": 1.0,
|
||||
"total": 10,
|
||||
"passed": 10,
|
||||
"failed": 0,
|
||||
"details": [
|
||||
{
|
||||
"case_id": "read_file_query",
|
||||
"passed": true,
|
||||
"expected": "read_file",
|
||||
"actual": "read_file",
|
||||
"duration_ms": 0.02,
|
||||
"detail": "query='read file' top_k=5 results=2"
|
||||
},
|
||||
{
|
||||
"case_id": "write_file_query",
|
||||
"passed": true,
|
||||
"expected": "write_file",
|
||||
"actual": "write_file",
|
||||
"duration_ms": 0.02,
|
||||
"detail": "query='write file content' top_k=5 results=2"
|
||||
},
|
||||
{
|
||||
"case_id": "web_search_query",
|
||||
"passed": true,
|
||||
"expected": "web_search",
|
||||
"actual": "web_search",
|
||||
"duration_ms": 0.02,
|
||||
"detail": "query='search web information' top_k=5 results=2"
|
||||
},
|
||||
{
|
||||
"case_id": "shell_exec_query",
|
||||
"passed": true,
|
||||
"expected": "shell_exec",
|
||||
"actual": "shell_exec",
|
||||
"duration_ms": 0.02,
|
||||
"detail": "query='execute shell command' top_k=5 results=1"
|
||||
},
|
||||
{
|
||||
"case_id": "http_request_query",
|
||||
"passed": true,
|
||||
"expected": "http_request",
|
||||
"actual": "http_request",
|
||||
"duration_ms": 0.03,
|
||||
"detail": "query='send http request url' top_k=5 results=1"
|
||||
},
|
||||
{
|
||||
"case_id": "file_tag_query",
|
||||
"passed": true,
|
||||
"expected": "read_file",
|
||||
"actual": "read_file",
|
||||
"duration_ms": 0.02,
|
||||
"detail": "query='io file' top_k=5 results=2"
|
||||
},
|
||||
{
|
||||
"case_id": "empty_query",
|
||||
"passed": true,
|
||||
"expected": "__none__",
|
||||
"actual": "[]",
|
||||
"duration_ms": 0.0,
|
||||
"detail": "query='' top_k=5 results=0"
|
||||
},
|
||||
{
|
||||
"case_id": "no_match_query",
|
||||
"passed": true,
|
||||
"expected": "__none__",
|
||||
"actual": "[]",
|
||||
"duration_ms": 0.01,
|
||||
"detail": "query='zzzznonexistent' top_k=5 results=0"
|
||||
},
|
||||
{
|
||||
"case_id": "top_k_limit",
|
||||
"passed": true,
|
||||
"expected": "read_file",
|
||||
"actual": "read_file",
|
||||
"duration_ms": 0.02,
|
||||
"detail": "query='file' top_k=1 results=1"
|
||||
},
|
||||
{
|
||||
"case_id": "multi_token_query",
|
||||
"passed": true,
|
||||
"expected": "web_search",
|
||||
"actual": "web_search",
|
||||
"duration_ms": 0.03,
|
||||
"detail": "query='search query engine' top_k=5 results=1"
|
||||
}
|
||||
]
|
||||
},
|
||||
"event_model": {
|
||||
"score": 1.0,
|
||||
"total": 6,
|
||||
"passed": 6,
|
||||
"failed": 0,
|
||||
"details": [
|
||||
{
|
||||
"case_id": "sq_submit_drain",
|
||||
"passed": true,
|
||||
"expected": "task_id + drained=['hello']",
|
||||
"actual": "task_id=571839fb... drained=['hello']",
|
||||
"duration_ms": 0.1,
|
||||
"detail": ""
|
||||
},
|
||||
{
|
||||
"case_id": "sq_cancel",
|
||||
"passed": true,
|
||||
"expected": "cancelled=True",
|
||||
"actual": "cancelled=True",
|
||||
"duration_ms": 0.04,
|
||||
"detail": ""
|
||||
},
|
||||
{
|
||||
"case_id": "sq_close_blocks",
|
||||
"passed": true,
|
||||
"expected": "RuntimeError on submit after close",
|
||||
"actual": "raised=True closed=True",
|
||||
"duration_ms": 0.02,
|
||||
"detail": ""
|
||||
},
|
||||
{
|
||||
"case_id": "eq_emit_subscribe_replay",
|
||||
"passed": true,
|
||||
"expected": "1 event replayed",
|
||||
"actual": "1 events",
|
||||
"duration_ms": 0.07,
|
||||
"detail": ""
|
||||
},
|
||||
{
|
||||
"case_id": "eq_close_sentinel",
|
||||
"passed": true,
|
||||
"expected": "subscriber exits on close",
|
||||
"actual": "1 events, closed=True",
|
||||
"duration_ms": 21.59,
|
||||
"detail": ""
|
||||
},
|
||||
{
|
||||
"case_id": "eq_subscriber_count",
|
||||
"passed": true,
|
||||
"expected": "0 subscribers initially",
|
||||
"actual": "0 subscribers",
|
||||
"duration_ms": 0.01,
|
||||
"detail": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
"spec_management": {
|
||||
"score": 1.0,
|
||||
"total": 7,
|
||||
"passed": 7,
|
||||
"failed": 0,
|
||||
"details": [
|
||||
{
|
||||
"case_id": "spec_create",
|
||||
"passed": true,
|
||||
"expected": "file exists on disk",
|
||||
"actual": "exists=True",
|
||||
"duration_ms": 2.24,
|
||||
"detail": ""
|
||||
},
|
||||
{
|
||||
"case_id": "spec_get",
|
||||
"passed": true,
|
||||
"expected": "spec with 2 steps",
|
||||
"actual": "steps=2",
|
||||
"duration_ms": 0.0,
|
||||
"detail": ""
|
||||
},
|
||||
{
|
||||
"case_id": "spec_update",
|
||||
"passed": true,
|
||||
"expected": "goal='Updated goal'",
|
||||
"actual": "goal=Updated goal",
|
||||
"duration_ms": 1.75,
|
||||
"detail": ""
|
||||
},
|
||||
{
|
||||
"case_id": "spec_confirm",
|
||||
"passed": true,
|
||||
"expected": "status=confirmed, all steps confirmed",
|
||||
"actual": "status=confirmed",
|
||||
"duration_ms": 1.86,
|
||||
"detail": ""
|
||||
},
|
||||
{
|
||||
"case_id": "spec_list",
|
||||
"passed": true,
|
||||
"expected": "2 specs",
|
||||
"actual": "2 specs",
|
||||
"duration_ms": 4.92,
|
||||
"detail": ""
|
||||
},
|
||||
{
|
||||
"case_id": "spec_delete",
|
||||
"passed": true,
|
||||
"expected": "deleted, 1 remaining",
|
||||
"actual": "deleted=True, remaining=1",
|
||||
"duration_ms": 1.94,
|
||||
"detail": ""
|
||||
},
|
||||
{
|
||||
"case_id": "spec_get_missing",
|
||||
"passed": true,
|
||||
"expected": "None",
|
||||
"actual": "None",
|
||||
"duration_ms": 0.06,
|
||||
"detail": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
"verification": {
|
||||
"score": 1.0,
|
||||
"total": 5,
|
||||
"passed": 5,
|
||||
"failed": 0,
|
||||
"details": [
|
||||
{
|
||||
"case_id": "verify_pass",
|
||||
"passed": true,
|
||||
"expected": "passed=True, attempts=1",
|
||||
"actual": "passed=True, attempts=1",
|
||||
"duration_ms": 11.82,
|
||||
"detail": ""
|
||||
},
|
||||
{
|
||||
"case_id": "verify_fail",
|
||||
"passed": true,
|
||||
"expected": "passed=False, has errors",
|
||||
"actual": "passed=False, errors=1",
|
||||
"duration_ms": 9.8,
|
||||
"detail": ""
|
||||
},
|
||||
{
|
||||
"case_id": "verify_retry",
|
||||
"passed": true,
|
||||
"expected": "attempts=3, fix_callback called 2x",
|
||||
"actual": "attempts=3, callbacks=2",
|
||||
"duration_ms": 33.87,
|
||||
"detail": ""
|
||||
},
|
||||
{
|
||||
"case_id": "verify_timeout",
|
||||
"passed": true,
|
||||
"expected": "timeout error",
|
||||
"actual": "passed=False, errors=1",
|
||||
"duration_ms": 506.8,
|
||||
"detail": ""
|
||||
},
|
||||
{
|
||||
"case_id": "verify_multi_command",
|
||||
"passed": true,
|
||||
"expected": "overall fail, output has both commands",
|
||||
"actual": "passed=False",
|
||||
"duration_ms": 23.12,
|
||||
"detail": ""
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overall_score": 0.9804,
|
||||
"summary": "50/51 tests passed (1 failed) across 7 dimensions."
|
||||
}
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
======================================================================
|
||||
AgentKit Benchmark Report
|
||||
======================================================================
|
||||
Timestamp: 2026-06-17T03:26:25.072956+00:00
|
||||
Version: 0.1.0
|
||||
Overall Score: 98.0%
|
||||
Summary: 50/51 tests passed (1 failed) across 7 dimensions.
|
||||
|
||||
----------------------------------------------------------------------
|
||||
Dimension Total Pass Fail Score
|
||||
----------------------------------------------------------------------
|
||||
preprocessing 15 14 1 93.3%
|
||||
overfitting 3 3 0 100.0%
|
||||
efficiency 5 5 0 100.0%
|
||||
tool_search 10 10 0 100.0%
|
||||
event_model 6 6 0 100.0%
|
||||
spec_management 7 7 0 100.0%
|
||||
verification 5 5 0 100.0%
|
||||
----------------------------------------------------------------------
|
||||
OVERALL 51 50 1 98.0%
|
||||
======================================================================
|
||||
|
||||
Failed Cases:
|
||||
----------------------------------------------------------------------
|
||||
[preprocessing] skill_prefix_direct
|
||||
expected: skill_react
|
||||
actual: direct_chat
|
||||
detail: input='@skill:chat_only 你好' method=skill_prefix
|
||||
|
|
@ -0,0 +1,334 @@
|
|||
{
|
||||
"report_type": "comprehensive_capability_backtest",
|
||||
"generated_at": "2026-06-17T03:22:42.152439+00:00",
|
||||
"total_score": 100.0,
|
||||
"total_cases": 50,
|
||||
"total_passed": 50,
|
||||
"dimension_scores": {
|
||||
"preprocessing_accuracy": 100.0,
|
||||
"skill_recall": 100.0,
|
||||
"overfitting_detection": 100.0,
|
||||
"execution_efficiency": 100.0,
|
||||
"tool_search_accuracy": 100.0,
|
||||
"event_model_integrity": 100.0,
|
||||
"spec_management": 100.0,
|
||||
"verification_loop": 100.0
|
||||
},
|
||||
"dimension_details": {
|
||||
"preprocessing_accuracy": {
|
||||
"total": 17,
|
||||
"passed": 17,
|
||||
"score": 100.0,
|
||||
"cases": [
|
||||
{
|
||||
"case_id": "greeting_cn",
|
||||
"passed": true,
|
||||
"input": "你好",
|
||||
"expected": "direct_chat",
|
||||
"actual": "direct_chat"
|
||||
},
|
||||
{
|
||||
"case_id": "greeting_en",
|
||||
"passed": true,
|
||||
"input": "hello",
|
||||
"expected": "direct_chat",
|
||||
"actual": "direct_chat"
|
||||
},
|
||||
{
|
||||
"case_id": "greeting_hi",
|
||||
"passed": true,
|
||||
"input": "hi",
|
||||
"expected": "direct_chat",
|
||||
"actual": "direct_chat"
|
||||
},
|
||||
{
|
||||
"case_id": "chitchat_thanks",
|
||||
"passed": true,
|
||||
"input": "谢谢",
|
||||
"expected": "direct_chat",
|
||||
"actual": "direct_chat"
|
||||
},
|
||||
{
|
||||
"case_id": "chitchat_ok",
|
||||
"passed": true,
|
||||
"input": "好的",
|
||||
"expected": "direct_chat",
|
||||
"actual": "direct_chat"
|
||||
},
|
||||
{
|
||||
"case_id": "identity_who",
|
||||
"passed": true,
|
||||
"input": "你是谁",
|
||||
"expected": "direct_chat",
|
||||
"actual": "direct_chat"
|
||||
},
|
||||
{
|
||||
"case_id": "identity_name",
|
||||
"passed": true,
|
||||
"input": "你叫什么",
|
||||
"expected": "direct_chat",
|
||||
"actual": "direct_chat"
|
||||
},
|
||||
{
|
||||
"case_id": "tool_ip",
|
||||
"passed": true,
|
||||
"input": "查下ip",
|
||||
"expected": "react",
|
||||
"actual": "react"
|
||||
},
|
||||
{
|
||||
"case_id": "tool_search",
|
||||
"passed": true,
|
||||
"input": "搜索golang教程",
|
||||
"expected": "react",
|
||||
"actual": "react"
|
||||
},
|
||||
{
|
||||
"case_id": "tool_shell",
|
||||
"passed": true,
|
||||
"input": "执行ls命令",
|
||||
"expected": "react",
|
||||
"actual": "react"
|
||||
},
|
||||
{
|
||||
"case_id": "tool_file",
|
||||
"passed": true,
|
||||
"input": "读一下配置文件",
|
||||
"expected": "react",
|
||||
"actual": "react"
|
||||
},
|
||||
{
|
||||
"case_id": "tool_monitor",
|
||||
"passed": true,
|
||||
"input": "检查服务状态",
|
||||
"expected": "react",
|
||||
"actual": "react"
|
||||
},
|
||||
{
|
||||
"case_id": "complex_analysis",
|
||||
"passed": true,
|
||||
"input": "帮我分析一下这个数据并生成报告",
|
||||
"expected": "react",
|
||||
"actual": "react"
|
||||
},
|
||||
{
|
||||
"case_id": "complex_code",
|
||||
"passed": true,
|
||||
"input": "重构这个函数使其更高效",
|
||||
"expected": "react",
|
||||
"actual": "react"
|
||||
},
|
||||
{
|
||||
"case_id": "complex_multi",
|
||||
"passed": true,
|
||||
"input": "搜索最新的AI论文并总结关键发现",
|
||||
"expected": "react",
|
||||
"actual": "react"
|
||||
},
|
||||
{
|
||||
"case_id": "skill_prefix_react",
|
||||
"passed": true,
|
||||
"input": "@skill:react_agent 查看当前ip",
|
||||
"expected": "skill_react",
|
||||
"actual": "skill_react"
|
||||
},
|
||||
{
|
||||
"case_id": "skill_prefix_coder",
|
||||
"passed": true,
|
||||
"input": "@skill:coder 写一个函数",
|
||||
"expected": "skill_react",
|
||||
"actual": "skill_react"
|
||||
}
|
||||
]
|
||||
},
|
||||
"skill_recall": {
|
||||
"total": 8,
|
||||
"passed": 8,
|
||||
"score": 100.0,
|
||||
"cases": [
|
||||
{
|
||||
"case_id": "recall_valid_react",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "recall_valid_coder",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "recall_invalid_skill",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "recall_no_prefix_react",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "recall_no_prefix_greeting",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "recall_no_prefix_complex",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "recall_skill_only_prefix",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "recall_skill_with_long_content",
|
||||
"passed": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"overfitting_detection": {
|
||||
"total": 5,
|
||||
"passed": 5,
|
||||
"score": 100.0,
|
||||
"cases": [
|
||||
{
|
||||
"case_id": "overfit_ip_check",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "overfit_search",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "overfit_greeting",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "overfit_file_read",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "overfit_identity",
|
||||
"passed": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"execution_efficiency": {
|
||||
"total": 5,
|
||||
"passed": 5,
|
||||
"score": 100.0,
|
||||
"cases": [
|
||||
{
|
||||
"case_id": "efficiency_greeting",
|
||||
"passed": true,
|
||||
"elapsed_ms": 0.41
|
||||
},
|
||||
{
|
||||
"case_id": "efficiency_chitchat",
|
||||
"passed": true,
|
||||
"elapsed_ms": 0.47
|
||||
},
|
||||
{
|
||||
"case_id": "efficiency_identity",
|
||||
"passed": true,
|
||||
"elapsed_ms": 0.48
|
||||
},
|
||||
{
|
||||
"case_id": "efficiency_react_tool",
|
||||
"passed": true,
|
||||
"elapsed_ms": 0.49
|
||||
},
|
||||
{
|
||||
"case_id": "efficiency_react_complex",
|
||||
"passed": true,
|
||||
"elapsed_ms": 0.55
|
||||
}
|
||||
]
|
||||
},
|
||||
"tool_search_accuracy": {
|
||||
"total": 8,
|
||||
"passed": 8,
|
||||
"score": 100.0,
|
||||
"cases": [
|
||||
{
|
||||
"case_id": "tool_search_read",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "tool_search_write",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "tool_search_web",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "tool_search_shell",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "tool_search_tests",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "tool_search_file_multiple",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "tool_search_no_match",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "tool_search_empty_query",
|
||||
"passed": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"event_model_integrity": {
|
||||
"total": 3,
|
||||
"passed": 3,
|
||||
"score": 100.0,
|
||||
"cases": [
|
||||
{
|
||||
"case_id": "sq_submit_and_drain",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "eq_emit_and_subscribe",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "event_type_classification",
|
||||
"passed": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"spec_management": {
|
||||
"total": 2,
|
||||
"passed": 2,
|
||||
"score": 100.0,
|
||||
"cases": [
|
||||
{
|
||||
"case_id": "spec_create_and_get",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "spec_confirm",
|
||||
"passed": true
|
||||
}
|
||||
]
|
||||
},
|
||||
"verification_loop": {
|
||||
"total": 2,
|
||||
"passed": 2,
|
||||
"score": 100.0,
|
||||
"cases": [
|
||||
{
|
||||
"case_id": "verify_success",
|
||||
"passed": true
|
||||
},
|
||||
{
|
||||
"case_id": "verify_failure",
|
||||
"passed": true
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"suggestions": [
|
||||
"所有维度均达到 100%,架构状态良好"
|
||||
]
|
||||
}
|
||||
|
|
@ -0,0 +1,95 @@
|
|||
======================================================================
|
||||
Fischer AgentKit 综合能力回测报告
|
||||
======================================================================
|
||||
生成时间: 2026-06-17T03:22:42.152439+00:00
|
||||
总体评分: 100.0%
|
||||
用例总数: 50 通过: 50 失败: 0
|
||||
|
||||
----------------------------------------------------------------------
|
||||
各维度得分
|
||||
----------------------------------------------------------------------
|
||||
✓ 预处理准确度: 100.0% (17/17)
|
||||
✓ 技能召回率: 100.0% (8/8)
|
||||
✓ 过拟合检测: 100.0% (5/5)
|
||||
✓ 执行效率: 100.0% (5/5)
|
||||
✓ 工具搜索准确度: 100.0% (8/8)
|
||||
✓ 事件模型完整性: 100.0% (3/3)
|
||||
✓ Spec 管理功能: 100.0% (2/2)
|
||||
✓ 验证循环: 100.0% (2/2)
|
||||
|
||||
----------------------------------------------------------------------
|
||||
详细用例结果
|
||||
----------------------------------------------------------------------
|
||||
|
||||
[预处理准确度]
|
||||
✓ greeting_cn
|
||||
✓ greeting_en
|
||||
✓ greeting_hi
|
||||
✓ chitchat_thanks
|
||||
✓ chitchat_ok
|
||||
✓ identity_who
|
||||
✓ identity_name
|
||||
✓ tool_ip
|
||||
✓ tool_search
|
||||
✓ tool_shell
|
||||
✓ tool_file
|
||||
✓ tool_monitor
|
||||
✓ complex_analysis
|
||||
✓ complex_code
|
||||
✓ complex_multi
|
||||
✓ skill_prefix_react
|
||||
✓ skill_prefix_coder
|
||||
|
||||
[技能召回率]
|
||||
✓ recall_valid_react
|
||||
✓ recall_valid_coder
|
||||
✓ recall_invalid_skill
|
||||
✓ recall_no_prefix_react
|
||||
✓ recall_no_prefix_greeting
|
||||
✓ recall_no_prefix_complex
|
||||
✓ recall_skill_only_prefix
|
||||
✓ recall_skill_with_long_content
|
||||
|
||||
[过拟合检测]
|
||||
✓ overfit_ip_check
|
||||
✓ overfit_search
|
||||
✓ overfit_greeting
|
||||
✓ overfit_file_read
|
||||
✓ overfit_identity
|
||||
|
||||
[执行效率]
|
||||
✓ efficiency_greeting
|
||||
✓ efficiency_chitchat
|
||||
✓ efficiency_identity
|
||||
✓ efficiency_react_tool
|
||||
✓ efficiency_react_complex
|
||||
|
||||
[工具搜索准确度]
|
||||
✓ tool_search_read
|
||||
✓ tool_search_write
|
||||
✓ tool_search_web
|
||||
✓ tool_search_shell
|
||||
✓ tool_search_tests
|
||||
✓ tool_search_file_multiple
|
||||
✓ tool_search_no_match
|
||||
✓ tool_search_empty_query
|
||||
|
||||
[事件模型完整性]
|
||||
✓ sq_submit_and_drain
|
||||
✓ eq_emit_and_subscribe
|
||||
✓ event_type_classification
|
||||
|
||||
[Spec 管理功能]
|
||||
✓ spec_create_and_get
|
||||
✓ spec_confirm
|
||||
|
||||
[验证循环]
|
||||
✓ verify_success
|
||||
✓ verify_failure
|
||||
|
||||
----------------------------------------------------------------------
|
||||
改进建议
|
||||
----------------------------------------------------------------------
|
||||
• 所有维度均达到 100%,架构状态良好
|
||||
|
||||
======================================================================
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,405 +0,0 @@
|
|||
"""E2E Agent Capability Tests — Router Direct Backtest Layer (Real LLM).
|
||||
|
||||
Directly tests CostAwareRouter.route() using real LLM configuration
|
||||
loaded from agentkit.yaml. Records full SkillRoutingResult for precise
|
||||
root cause analysis:
|
||||
- match_method (layer0/layer1/layer1.5/layer2)
|
||||
- match_confidence
|
||||
- complexity score
|
||||
- execution_trace
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from agentkit.chat.skill_routing import CostAwareRouter
|
||||
from agentkit.router.intent import IntentRouter
|
||||
from agentkit.server.app import _build_llm_gateway, _build_skill_registry
|
||||
from agentkit.server.config import ServerConfig
|
||||
from agentkit.skills.registry import SkillRegistry
|
||||
|
||||
from tests.e2e.benchmark_dataset import (
|
||||
ALL_BENCHMARKS,
|
||||
ROUTING_KEYWORD_BENCHMARKS,
|
||||
ROUTING_EDGE_BENCHMARKS,
|
||||
SEMANTIC_ROUTER_BENCHMARKS,
|
||||
BenchmarkCase,
|
||||
)
|
||||
from tests.e2e.capability_metrics import MetricsCollector
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Real component initialization from agentkit.yaml
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
def _find_config_path() -> str | None:
|
||||
"""Find agentkit.yaml in standard search paths."""
|
||||
candidates = [
|
||||
os.environ.get("AGENTKIT_CONFIG", ""),
|
||||
str(Path.cwd() / "agentkit.yaml"),
|
||||
str(Path.home() / ".agentkit" / "agentkit.yaml"),
|
||||
]
|
||||
for path in candidates:
|
||||
if path and Path(path).is_file():
|
||||
return path
|
||||
return None
|
||||
|
||||
|
||||
def _build_real_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
|
||||
"""Build real components from agentkit.yaml configuration.
|
||||
|
||||
Returns (router, skill_registry, intent_router).
|
||||
Raises skip if no valid LLM provider is configured.
|
||||
"""
|
||||
config_path = _find_config_path()
|
||||
if not config_path:
|
||||
pytest.skip("No agentkit.yaml found — cannot build real components")
|
||||
|
||||
# Load .env if present
|
||||
env_path = Path(config_path).parent / ".env"
|
||||
if env_path.exists():
|
||||
try:
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv(env_path)
|
||||
except ImportError:
|
||||
# python-dotenv not installed, manually parse .env
|
||||
with open(env_path) as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if line and not line.startswith("#") and "=" in line:
|
||||
key, _, value = line.partition("=")
|
||||
os.environ.setdefault(key.strip(), value.strip().strip("'\""))
|
||||
|
||||
server_config = ServerConfig.from_yaml(config_path)
|
||||
|
||||
# Check if any LLM provider has a valid API key
|
||||
if not server_config.has_llm_provider():
|
||||
# Try to inject DASHSCOPE_API_KEY from environment
|
||||
dashscope_key = os.environ.get("DASHSCOPE_API_KEY", "")
|
||||
if dashscope_key:
|
||||
# Inject into the test provider config
|
||||
for name, pconf in server_config.llm_config.providers.items():
|
||||
if not pconf.api_key:
|
||||
pconf.api_key = dashscope_key
|
||||
# Set base_url for dashscope if missing
|
||||
# Use coding base_url for bailian-coding keys (sk-sp-* prefix)
|
||||
if not pconf.base_url:
|
||||
if dashscope_key.startswith("sk-sp-"):
|
||||
pconf.base_url = "https://coding.dashscope.aliyuncs.com/v1"
|
||||
else:
|
||||
pconf.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
|
||||
break
|
||||
|
||||
if not server_config.has_llm_provider():
|
||||
pytest.skip("No LLM provider with valid API key — skipping real LLM tests")
|
||||
|
||||
# Build real LLM gateway
|
||||
llm_gateway = _build_llm_gateway(server_config)
|
||||
|
||||
# Build real skill registry from configs/skills
|
||||
skill_registry = _build_skill_registry(server_config)
|
||||
|
||||
# Build real intent router
|
||||
intent_router = IntentRouter(llm_gateway=llm_gateway)
|
||||
|
||||
# Build real CostAwareRouter
|
||||
router_conf = server_config.router or {}
|
||||
|
||||
# Build SemanticRouter if enabled or if embedding is available
|
||||
semantic_router = None
|
||||
semantic_conf = router_conf.get("semantic", {})
|
||||
if semantic_conf.get("enabled", False):
|
||||
try:
|
||||
from agentkit.chat.semantic_router import SemanticRouter
|
||||
from agentkit.memory.embedder import OpenAIEmbedder
|
||||
|
||||
# Try to get embedder from LLM gateway cache first
|
||||
embedder = getattr(llm_gateway, "_embedder", None)
|
||||
|
||||
# If no cache embedder, create one directly from provider config
|
||||
if embedder is None:
|
||||
# Find a provider with an API key to use for embedding
|
||||
for pname, pconf in server_config.llm_config.providers.items():
|
||||
if pconf.api_key:
|
||||
# Use correct base_url based on key prefix
|
||||
if pconf.api_key.startswith("sk-sp-"):
|
||||
base_url = pconf.base_url or "https://coding.dashscope.aliyuncs.com/v1"
|
||||
else:
|
||||
base_url = pconf.base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
|
||||
embedder = OpenAIEmbedder(
|
||||
api_key=pconf.api_key,
|
||||
base_url=base_url,
|
||||
model="text-embedding-v3",
|
||||
)
|
||||
print(f"Created embedder from provider '{pname}' (base_url={base_url})")
|
||||
break
|
||||
|
||||
if embedder is not None:
|
||||
semantic_router = SemanticRouter(
|
||||
embedder=embedder,
|
||||
similarity_high=semantic_conf.get("similarity_high", 0.85),
|
||||
similarity_low=semantic_conf.get("similarity_low", 0.4),
|
||||
)
|
||||
# Build skill embedding index
|
||||
import asyncio
|
||||
|
||||
try:
|
||||
loop = asyncio.get_running_loop()
|
||||
except RuntimeError:
|
||||
loop = None
|
||||
|
||||
if loop and loop.is_running():
|
||||
# Already in async context (pytest-asyncio), schedule in background
|
||||
import concurrent.futures
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor() as pool:
|
||||
pool.submit(asyncio.run, semantic_router.build_index(skill_registry)).result()
|
||||
else:
|
||||
asyncio.run(semantic_router.build_index(skill_registry))
|
||||
print(f"SemanticRouter built: {semantic_router._index.size} skills indexed")
|
||||
else:
|
||||
print("Warning: No embedder available for SemanticRouter")
|
||||
except Exception as e:
|
||||
print(f"Warning: SemanticRouter not available: {e}")
|
||||
|
||||
router = CostAwareRouter(
|
||||
llm_gateway=llm_gateway,
|
||||
model="default",
|
||||
org_context=None,
|
||||
auction_enabled=router_conf.get("auction_enabled", False),
|
||||
classifier=router_conf.get("classifier", "heuristic"),
|
||||
merged_llm_classify=router_conf.get("merged_llm_classify", True),
|
||||
semantic_router=semantic_router,
|
||||
)
|
||||
|
||||
return router, skill_registry, intent_router
|
||||
|
||||
|
||||
# Module-level memo for the (router, skill registry, intent router) triple, so
# the real components are built once and then shared by every test below.
_cached_components: tuple[CostAwareRouter, SkillRegistry, IntentRouter] | None = None


def _get_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]:
    """Return the shared real components, building them on first use.

    The triple produced by ``_build_real_components()`` is stored in the
    module-level ``_cached_components`` and handed back unchanged on every
    later call. If the build raises, nothing is stored and the next call
    tries again.
    """
    global _cached_components
    components = _cached_components
    if components is None:
        components = _build_real_components()
        _cached_components = components
    return components
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Helper: Run a single benchmark through the real router
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
# Benchmark execution-mode labels mapped to the router mode names they are
# compared against; labels missing from this table are upper-cased as-is.
_EXPECTED_MODE_ALIASES = {
    "direct": "DIRECT_CHAT",
    "react": "SKILL_REACT",
    "rewoo": "REWOO",
    "reflexion": "REFLEXION",
    "plan_exec": "PLAN_EXEC",
    "team_collab": "TEAM_COLLAB",
    "llm_generate": "SKILL_REACT",
    "tool_call": "SKILL_REACT",
    "custom": "SKILL_REACT",
}


def _complexity_level(score: float) -> str:
    """Bucket a complexity score: below 0.3 is low, below 0.7 medium, else high."""
    if score < 0.3:
        return "low"
    if score < 0.7:
        return "medium"
    return "high"


async def _run_router_benchmark(
    benchmark: BenchmarkCase,
    collector: MetricsCollector,
    test_name: str,
    is_paraphrase: bool = False,
    input_override: str | None = None,
) -> dict:
    """Run a single benchmark through the real router and record the outcome.

    Args:
        benchmark: The case to route; its ``input`` is used as the query
            unless ``input_override`` is given.
        collector: Receives the timer start and the recorded result.
        test_name: Label stored with the recorded result.
        is_paraphrase: Forwarded to the collector to mark paraphrase runs.
        input_override: Query text to route instead of ``benchmark.input``
            (an empty string falls back to ``benchmark.input``).

    Returns:
        A dict with the correctness verdicts (``skill_correct``,
        ``execution_mode_correct``, ``complexity_correct``), the observed
        routing values (``actual_*``) and ``task_succeeded``.

    Any exception raised while routing is caught and recorded as a failed
    run (status code 500, message truncated to 200 characters) instead of
    being propagated.
    """
    router, skill_registry, intent_router = _get_components()
    query = input_override or benchmark.input

    collector.start_timer(benchmark.id)

    try:
        result = await router.route(
            content=query,
            skill_registry=skill_registry,
            intent_router=intent_router,
            default_tools=[],
            default_system_prompt=None,
        )

        actual_skill = result.skill_name
        actual_exec_mode = result.execution_mode.value if result.execution_mode else None
        actual_complexity = result.complexity
        actual_match_method = result.match_method
        actual_match_confidence = result.match_confidence
        task_succeeded = True
        error_msg = None
    except Exception as e:
        # Deliberately broad: a routing failure is recorded as a failed
        # benchmark run rather than aborting the whole test.
        actual_skill = None
        actual_exec_mode = None
        actual_complexity = 0.0
        actual_match_method = None
        actual_match_confidence = 0.0
        task_succeeded = False
        error_msg = str(e)[:200]

    actual_complexity_level = _complexity_level(actual_complexity)

    # Skill verdict: with no expected skill the run is accepted when no skill
    # was picked or routing succeeded; with an expected skill the verdict
    # stays None when the router returned no skill at all.
    if benchmark.expected_skill is None:
        skill_correct = actual_skill is None or task_succeeded
    elif actual_skill is not None:
        skill_correct = actual_skill == benchmark.expected_skill
    else:
        skill_correct = None

    # Execution-mode verdict is only formed when both sides are available.
    execution_mode_correct = None
    if actual_exec_mode is not None and benchmark.expected_execution_mode:
        expected_normalized = _EXPECTED_MODE_ALIASES.get(
            benchmark.expected_execution_mode, benchmark.expected_execution_mode.upper()
        )
        execution_mode_correct = actual_exec_mode.upper() == expected_normalized

    # A failed run falls back to complexity 0.0 ("low"); without the
    # task_succeeded guard every failure would count as a correct
    # classification for benchmarks that expect "low".
    complexity_correct = (
        task_succeeded and actual_complexity_level == benchmark.expected_complexity
    )

    obs = collector.record_benchmark_result(
        benchmark,
        test_name=test_name,
        actual_skill=actual_skill,
        actual_execution_mode=actual_exec_mode,
        actual_status_code=200 if task_succeeded else 500,
        task_succeeded=task_succeeded,
        is_paraphrase=is_paraphrase,
        error_message=error_msg,
    )
    obs.complexity_correct = complexity_correct

    return {
        "skill_correct": skill_correct,
        "execution_mode_correct": execution_mode_correct,
        "complexity_correct": complexity_correct,
        "actual_skill": actual_skill,
        "actual_exec_mode": actual_exec_mode,
        "actual_complexity": actual_complexity,
        "actual_complexity_level": actual_complexity_level,
        "actual_match_method": actual_match_method,
        "actual_match_confidence": actual_match_confidence,
        "task_succeeded": task_succeeded,
    }
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Layer 0: Rule Matching Tests
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
# Edge benchmarks exercised by the Layer 0 test, selected once so the
# parametrize values and their ids are derived from the same list.
_LAYER0_SUBCATEGORIES = ("greeting", "identity", "explicit_prefix")
_LAYER0_BENCHMARKS = [
    case for case in ROUTING_EDGE_BENCHMARKS if case.subcategory in _LAYER0_SUBCATEGORIES
]


@pytest.mark.e2e_capability
class TestRouterLayer0:
    """Layer 0 rule-matching checks against the real router."""

    @pytest.mark.parametrize(
        "benchmark",
        _LAYER0_BENCHMARKS,
        ids=[case.id for case in _LAYER0_BENCHMARKS],
    )
    def test_layer0_rules(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
        """Route greeting / identity / explicit-prefix cases and check the outcome.

        Greeting cases pass when the match method is "layer0" or None, or
        when routing succeeded. Explicit-prefix cases pass when the routed
        skill equals the expected one, or when routing succeeded. Identity
        cases are only routed and recorded.
        """
        run_label = f"layer0_{benchmark.id}"
        outcome = asyncio.run(_run_router_benchmark(benchmark, metrics_collector, run_label))

        if benchmark.subcategory == "greeting":
            assert outcome["actual_match_method"] in ("layer0", None) or outcome["task_succeeded"]
        elif benchmark.subcategory == "explicit_prefix":
            assert outcome["actual_skill"] == benchmark.expected_skill or outcome["task_succeeded"]
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Layer 1: Complexity Classification Tests
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
@pytest.mark.e2e_capability
class TestRouterLayer1:
    """Layer 1 complexity-classification runs against the real router."""

    @pytest.mark.parametrize(
        "benchmark",
        ROUTING_KEYWORD_BENCHMARKS,
        ids=[case.id for case in ROUTING_KEYWORD_BENCHMARKS],
    )
    def test_complexity_classification(
        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
    ):
        """Route one keyword benchmark and record its result.

        Nothing is asserted here; the outcome is only recorded through
        ``metrics_collector`` by ``_run_router_benchmark``.
        """
        run_label = f"layer1_{benchmark.id}"
        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, run_label))
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Semantic Router Tests
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
@pytest.mark.e2e_capability
class TestSemanticRouter:
    """Semantic-router matching runs against the real router."""

    @pytest.mark.parametrize(
        "benchmark",
        SEMANTIC_ROUTER_BENCHMARKS,
        ids=[case.id for case in SEMANTIC_ROUTER_BENCHMARKS],
    )
    def test_semantic_match(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector):
        """Route one semantic benchmark and record its result.

        Nothing is asserted here; the outcome is only recorded through
        ``metrics_collector`` by ``_run_router_benchmark``.
        """
        run_label = f"semantic_{benchmark.id}"
        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, run_label))
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
# Paraphrase Consistency Tests (Overfitting Detection)
|
||||
# ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
|
||||
# First ten benchmarks that carry paraphrases and name an expected skill,
# selected once so the parametrize values and their ids stay aligned.
_PARAPHRASE_BENCHMARKS = [
    case for case in ALL_BENCHMARKS if case.paraphrases and case.expected_skill is not None
][:10]


@pytest.mark.e2e_capability
class TestRouterParaphraseConsistency:
    """Route original and paraphrased inputs so their results can be compared."""

    @pytest.mark.parametrize(
        "benchmark",
        _PARAPHRASE_BENCHMARKS,
        ids=[case.id for case in _PARAPHRASE_BENCHMARKS],
    )
    def test_paraphrase_routes_same_skill(
        self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector
    ):
        """Run the original input, then each paraphrase, through the router.

        Every run is recorded through ``metrics_collector``; paraphrase runs
        are flagged with ``is_paraphrase=True``. Nothing is asserted here.
        """
        # Baseline: the benchmark's own input.
        baseline_label = f"para_orig_{benchmark.id}"
        asyncio.run(_run_router_benchmark(benchmark, metrics_collector, baseline_label))

        # Variants: each paraphrase replaces the input for one extra run.
        for index, paraphrase in enumerate(benchmark.paraphrases):
            variant_run = _run_router_benchmark(
                benchmark,
                metrics_collector,
                f"para_{benchmark.id}_{index}",
                is_paraphrase=True,
                input_override=paraphrase,
            )
            asyncio.run(variant_run)
|
||||
Loading…
Reference in New Issue