472 lines
13 KiB
JSON
472 lines
13 KiB
JSON
{
|
|
"timestamp": "2026-06-17T03:26:25.072956+00:00",
|
|
"version": "0.1.0",
|
|
"dimensions": {
|
|
"preprocessing": {
|
|
"score": 0.9333,
|
|
"total": 15,
|
|
"passed": 14,
|
|
"failed": 1,
|
|
"details": [
|
|
{
|
|
"case_id": "greeting_cn",
|
|
"passed": true,
|
|
"expected": "direct_chat",
|
|
"actual": "direct_chat",
|
|
"duration_ms": 0.03,
|
|
"detail": "input='你好' method=regex_direct"
|
|
},
|
|
{
|
|
"case_id": "greeting_en",
|
|
"passed": true,
|
|
"expected": "direct_chat",
|
|
"actual": "direct_chat",
|
|
"duration_ms": 0.02,
|
|
"detail": "input='hello' method=regex_direct"
|
|
},
|
|
{
|
|
"case_id": "chitchat_thanks",
|
|
"passed": true,
|
|
"expected": "direct_chat",
|
|
"actual": "direct_chat",
|
|
"duration_ms": 0.01,
|
|
"detail": "input='谢谢' method=regex_direct"
|
|
},
|
|
{
|
|
"case_id": "identity_who",
|
|
"passed": true,
|
|
"expected": "direct_chat",
|
|
"actual": "direct_chat",
|
|
"duration_ms": 0.02,
|
|
"detail": "input='你是谁' method=regex_direct"
|
|
},
|
|
{
|
|
"case_id": "colloquial_ip_1",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "react",
|
|
"duration_ms": 0.02,
|
|
"detail": "input='查下ip' method=default_react"
|
|
},
|
|
{
|
|
"case_id": "colloquial_ip_2",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "react",
|
|
"duration_ms": 0.01,
|
|
"detail": "input='查看当前ip' method=default_react"
|
|
},
|
|
{
|
|
"case_id": "tool_search",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "react",
|
|
"duration_ms": 0.01,
|
|
"detail": "input='搜索golang教程' method=default_react"
|
|
},
|
|
{
|
|
"case_id": "tool_shell",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "react",
|
|
"duration_ms": 0.01,
|
|
"detail": "input='执行ls命令' method=default_react"
|
|
},
|
|
{
|
|
"case_id": "translation",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "react",
|
|
"duration_ms": 0.01,
|
|
"detail": "input='翻译hello为中文' method=default_react"
|
|
},
|
|
{
|
|
"case_id": "knowledge",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "react",
|
|
"duration_ms": 0.01,
|
|
"detail": "input='什么是机器学习' method=default_react"
|
|
},
|
|
{
|
|
"case_id": "skill_prefix_react",
|
|
"passed": true,
|
|
"expected": "skill_react",
|
|
"actual": "skill_react",
|
|
"duration_ms": 0.03,
|
|
"detail": "input='@skill:react_agent 查看ip' method=skill_prefix"
|
|
},
|
|
{
|
|
"case_id": "skill_prefix_direct",
|
|
"passed": false,
|
|
"expected": "skill_react",
|
|
"actual": "direct_chat",
|
|
"duration_ms": 0.02,
|
|
"detail": "input='@skill:chat_only 你好' method=skill_prefix"
|
|
},
|
|
{
|
|
"case_id": "skill_not_found",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "react",
|
|
"duration_ms": 0.13,
|
|
"detail": "input='@skill:nonexistent 做点什么' method=skill_not_found_fallback"
|
|
},
|
|
{
|
|
"case_id": "complex_analysis",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "react",
|
|
"duration_ms": 0.01,
|
|
"detail": "input='帮我分析一下这个数据并生成报告' method=default_react"
|
|
},
|
|
{
|
|
"case_id": "empty_fallback",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "react",
|
|
"duration_ms": 0.01,
|
|
"detail": "input='随便聊聊' method=default_react"
|
|
}
|
|
]
|
|
},
|
|
"overfitting": {
|
|
"score": 1.0,
|
|
"total": 3,
|
|
"passed": 3,
|
|
"failed": 0,
|
|
"details": [
|
|
{
|
|
"case_id": "ip_check_variants",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "react,react,react,react,react",
|
|
"duration_ms": 0.0,
|
|
"detail": "paraphrases=5 consistent=True"
|
|
},
|
|
{
|
|
"case_id": "search_variants",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "react,react,react",
|
|
"duration_ms": 0.0,
|
|
"detail": "paraphrases=3 consistent=True"
|
|
},
|
|
{
|
|
"case_id": "greeting_variants",
|
|
"passed": true,
|
|
"expected": "direct_chat",
|
|
"actual": "direct_chat,direct_chat,direct_chat,direct_chat,direct_chat",
|
|
"duration_ms": 0.0,
|
|
"detail": "paraphrases=5 consistent=True"
|
|
}
|
|
]
|
|
},
|
|
"efficiency": {
|
|
"score": 1.0,
|
|
"total": 5,
|
|
"passed": 5,
|
|
"failed": 0,
|
|
"details": [
|
|
{
|
|
"case_id": "preprocess_greeting",
|
|
"passed": true,
|
|
"expected": "<= 50.0ms/call",
|
|
"actual": "0.004ms/call",
|
|
"duration_ms": 0.44,
|
|
"detail": "iterations=100"
|
|
},
|
|
{
|
|
"case_id": "preprocess_react",
|
|
"passed": true,
|
|
"expected": "<= 50.0ms/call",
|
|
"actual": "0.004ms/call",
|
|
"duration_ms": 0.38,
|
|
"detail": "iterations=100"
|
|
},
|
|
{
|
|
"case_id": "preprocess_skill_prefix",
|
|
"passed": true,
|
|
"expected": "<= 50.0ms/call",
|
|
"actual": "0.005ms/call",
|
|
"duration_ms": 0.51,
|
|
"detail": "iterations=100"
|
|
},
|
|
{
|
|
"case_id": "tool_search_query",
|
|
"passed": true,
|
|
"expected": "<= 10.0ms/call",
|
|
"actual": "0.008ms/call",
|
|
"duration_ms": 1.69,
|
|
"detail": "iterations=200"
|
|
},
|
|
{
|
|
"case_id": "tool_search_empty",
|
|
"passed": true,
|
|
"expected": "<= 5.0ms/call",
|
|
"actual": "0.000ms/call",
|
|
"duration_ms": 0.08,
|
|
"detail": "iterations=200"
|
|
}
|
|
]
|
|
},
|
|
"tool_search": {
|
|
"score": 1.0,
|
|
"total": 10,
|
|
"passed": 10,
|
|
"failed": 0,
|
|
"details": [
|
|
{
|
|
"case_id": "read_file_query",
|
|
"passed": true,
|
|
"expected": "read_file",
|
|
"actual": "read_file",
|
|
"duration_ms": 0.02,
|
|
"detail": "query='read file' top_k=5 results=2"
|
|
},
|
|
{
|
|
"case_id": "write_file_query",
|
|
"passed": true,
|
|
"expected": "write_file",
|
|
"actual": "write_file",
|
|
"duration_ms": 0.02,
|
|
"detail": "query='write file content' top_k=5 results=2"
|
|
},
|
|
{
|
|
"case_id": "web_search_query",
|
|
"passed": true,
|
|
"expected": "web_search",
|
|
"actual": "web_search",
|
|
"duration_ms": 0.02,
|
|
"detail": "query='search web information' top_k=5 results=2"
|
|
},
|
|
{
|
|
"case_id": "shell_exec_query",
|
|
"passed": true,
|
|
"expected": "shell_exec",
|
|
"actual": "shell_exec",
|
|
"duration_ms": 0.02,
|
|
"detail": "query='execute shell command' top_k=5 results=1"
|
|
},
|
|
{
|
|
"case_id": "http_request_query",
|
|
"passed": true,
|
|
"expected": "http_request",
|
|
"actual": "http_request",
|
|
"duration_ms": 0.03,
|
|
"detail": "query='send http request url' top_k=5 results=1"
|
|
},
|
|
{
|
|
"case_id": "file_tag_query",
|
|
"passed": true,
|
|
"expected": "read_file",
|
|
"actual": "read_file",
|
|
"duration_ms": 0.02,
|
|
"detail": "query='io file' top_k=5 results=2"
|
|
},
|
|
{
|
|
"case_id": "empty_query",
|
|
"passed": true,
|
|
"expected": "__none__",
|
|
"actual": "[]",
|
|
"duration_ms": 0.0,
|
|
"detail": "query='' top_k=5 results=0"
|
|
},
|
|
{
|
|
"case_id": "no_match_query",
|
|
"passed": true,
|
|
"expected": "__none__",
|
|
"actual": "[]",
|
|
"duration_ms": 0.01,
|
|
"detail": "query='zzzznonexistent' top_k=5 results=0"
|
|
},
|
|
{
|
|
"case_id": "top_k_limit",
|
|
"passed": true,
|
|
"expected": "read_file",
|
|
"actual": "read_file",
|
|
"duration_ms": 0.02,
|
|
"detail": "query='file' top_k=1 results=1"
|
|
},
|
|
{
|
|
"case_id": "multi_token_query",
|
|
"passed": true,
|
|
"expected": "web_search",
|
|
"actual": "web_search",
|
|
"duration_ms": 0.03,
|
|
"detail": "query='search query engine' top_k=5 results=1"
|
|
}
|
|
]
|
|
},
|
|
"event_model": {
|
|
"score": 1.0,
|
|
"total": 6,
|
|
"passed": 6,
|
|
"failed": 0,
|
|
"details": [
|
|
{
|
|
"case_id": "sq_submit_drain",
|
|
"passed": true,
|
|
"expected": "task_id + drained=['hello']",
|
|
"actual": "task_id=571839fb... drained=['hello']",
|
|
"duration_ms": 0.1,
|
|
"detail": ""
|
|
},
|
|
{
|
|
"case_id": "sq_cancel",
|
|
"passed": true,
|
|
"expected": "cancelled=True",
|
|
"actual": "cancelled=True",
|
|
"duration_ms": 0.04,
|
|
"detail": ""
|
|
},
|
|
{
|
|
"case_id": "sq_close_blocks",
|
|
"passed": true,
|
|
"expected": "RuntimeError on submit after close",
|
|
"actual": "raised=True closed=True",
|
|
"duration_ms": 0.02,
|
|
"detail": ""
|
|
},
|
|
{
|
|
"case_id": "eq_emit_subscribe_replay",
|
|
"passed": true,
|
|
"expected": "1 event replayed",
|
|
"actual": "1 events",
|
|
"duration_ms": 0.07,
|
|
"detail": ""
|
|
},
|
|
{
|
|
"case_id": "eq_close_sentinel",
|
|
"passed": true,
|
|
"expected": "subscriber exits on close",
|
|
"actual": "1 events, closed=True",
|
|
"duration_ms": 21.59,
|
|
"detail": ""
|
|
},
|
|
{
|
|
"case_id": "eq_subscriber_count",
|
|
"passed": true,
|
|
"expected": "0 subscribers initially",
|
|
"actual": "0 subscribers",
|
|
"duration_ms": 0.01,
|
|
"detail": ""
|
|
}
|
|
]
|
|
},
|
|
"spec_management": {
|
|
"score": 1.0,
|
|
"total": 7,
|
|
"passed": 7,
|
|
"failed": 0,
|
|
"details": [
|
|
{
|
|
"case_id": "spec_create",
|
|
"passed": true,
|
|
"expected": "file exists on disk",
|
|
"actual": "exists=True",
|
|
"duration_ms": 2.24,
|
|
"detail": ""
|
|
},
|
|
{
|
|
"case_id": "spec_get",
|
|
"passed": true,
|
|
"expected": "spec with 2 steps",
|
|
"actual": "steps=2",
|
|
"duration_ms": 0.0,
|
|
"detail": ""
|
|
},
|
|
{
|
|
"case_id": "spec_update",
|
|
"passed": true,
|
|
"expected": "goal='Updated goal'",
|
|
"actual": "goal=Updated goal",
|
|
"duration_ms": 1.75,
|
|
"detail": ""
|
|
},
|
|
{
|
|
"case_id": "spec_confirm",
|
|
"passed": true,
|
|
"expected": "status=confirmed, all steps confirmed",
|
|
"actual": "status=confirmed",
|
|
"duration_ms": 1.86,
|
|
"detail": ""
|
|
},
|
|
{
|
|
"case_id": "spec_list",
|
|
"passed": true,
|
|
"expected": "2 specs",
|
|
"actual": "2 specs",
|
|
"duration_ms": 4.92,
|
|
"detail": ""
|
|
},
|
|
{
|
|
"case_id": "spec_delete",
|
|
"passed": true,
|
|
"expected": "deleted, 1 remaining",
|
|
"actual": "deleted=True, remaining=1",
|
|
"duration_ms": 1.94,
|
|
"detail": ""
|
|
},
|
|
{
|
|
"case_id": "spec_get_missing",
|
|
"passed": true,
|
|
"expected": "None",
|
|
"actual": "None",
|
|
"duration_ms": 0.06,
|
|
"detail": ""
|
|
}
|
|
]
|
|
},
|
|
"verification": {
|
|
"score": 1.0,
|
|
"total": 5,
|
|
"passed": 5,
|
|
"failed": 0,
|
|
"details": [
|
|
{
|
|
"case_id": "verify_pass",
|
|
"passed": true,
|
|
"expected": "passed=True, attempts=1",
|
|
"actual": "passed=True, attempts=1",
|
|
"duration_ms": 11.82,
|
|
"detail": ""
|
|
},
|
|
{
|
|
"case_id": "verify_fail",
|
|
"passed": true,
|
|
"expected": "passed=False, has errors",
|
|
"actual": "passed=False, errors=1",
|
|
"duration_ms": 9.8,
|
|
"detail": ""
|
|
},
|
|
{
|
|
"case_id": "verify_retry",
|
|
"passed": true,
|
|
"expected": "attempts=3, fix_callback called 2x",
|
|
"actual": "attempts=3, callbacks=2",
|
|
"duration_ms": 33.87,
|
|
"detail": ""
|
|
},
|
|
{
|
|
"case_id": "verify_timeout",
|
|
"passed": true,
|
|
"expected": "timeout error",
|
|
"actual": "passed=False, errors=1",
|
|
"duration_ms": 506.8,
|
|
"detail": ""
|
|
},
|
|
{
|
|
"case_id": "verify_multi_command",
|
|
"passed": true,
|
|
"expected": "overall fail, output has both commands",
|
|
"actual": "passed=False",
|
|
"duration_ms": 23.12,
|
|
"detail": ""
|
|
}
|
|
]
|
|
}
|
|
},
|
|
"overall_score": 0.9804,
|
|
"summary": "50/51 tests passed (1 failed) across 7 dimensions."
|
|
} |