fischer-agentkit/test-results/benchmark/benchmark_report.json

472 lines
13 KiB
JSON

{
"timestamp": "2026-06-17T03:26:25.072956+00:00",
"version": "0.1.0",
"dimensions": {
"preprocessing": {
"score": 0.9333,
"total": 15,
"passed": 14,
"failed": 1,
"details": [
{
"case_id": "greeting_cn",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat",
"duration_ms": 0.03,
"detail": "input='你好' method=regex_direct"
},
{
"case_id": "greeting_en",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat",
"duration_ms": 0.02,
"detail": "input='hello' method=regex_direct"
},
{
"case_id": "chitchat_thanks",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat",
"duration_ms": 0.01,
"detail": "input='谢谢' method=regex_direct"
},
{
"case_id": "identity_who",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat",
"duration_ms": 0.02,
"detail": "input='你是谁' method=regex_direct"
},
{
"case_id": "colloquial_ip_1",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.02,
"detail": "input='查下ip' method=default_react"
},
{
"case_id": "colloquial_ip_2",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.01,
"detail": "input='查看当前ip' method=default_react"
},
{
"case_id": "tool_search",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.01,
"detail": "input='搜索golang教程' method=default_react"
},
{
"case_id": "tool_shell",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.01,
"detail": "input='执行ls命令' method=default_react"
},
{
"case_id": "translation",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.01,
"detail": "input='翻译hello为中文' method=default_react"
},
{
"case_id": "knowledge",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.01,
"detail": "input='什么是机器学习' method=default_react"
},
{
"case_id": "skill_prefix_react",
"passed": true,
"expected": "skill_react",
"actual": "skill_react",
"duration_ms": 0.03,
"detail": "input='@skill:react_agent 查看ip' method=skill_prefix"
},
{
"case_id": "skill_prefix_direct",
"passed": false,
"expected": "skill_react",
"actual": "direct_chat",
"duration_ms": 0.02,
"detail": "input='@skill:chat_only 你好' method=skill_prefix"
},
{
"case_id": "skill_not_found",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.13,
"detail": "input='@skill:nonexistent 做点什么' method=skill_not_found_fallback"
},
{
"case_id": "complex_analysis",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.01,
"detail": "input='帮我分析一下这个数据并生成报告' method=default_react"
},
{
"case_id": "empty_fallback",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.01,
"detail": "input='随便聊聊' method=default_react"
}
]
},
"overfitting": {
"score": 1.0,
"total": 3,
"passed": 3,
"failed": 0,
"details": [
{
"case_id": "ip_check_variants",
"passed": true,
"expected": "react",
"actual": "react,react,react,react,react",
"duration_ms": 0.0,
"detail": "paraphrases=5 consistent=True"
},
{
"case_id": "search_variants",
"passed": true,
"expected": "react",
"actual": "react,react,react",
"duration_ms": 0.0,
"detail": "paraphrases=3 consistent=True"
},
{
"case_id": "greeting_variants",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat,direct_chat,direct_chat,direct_chat,direct_chat",
"duration_ms": 0.0,
"detail": "paraphrases=5 consistent=True"
}
]
},
"efficiency": {
"score": 1.0,
"total": 5,
"passed": 5,
"failed": 0,
"details": [
{
"case_id": "preprocess_greeting",
"passed": true,
"expected": "<= 50.0ms/call",
"actual": "0.004ms/call",
"duration_ms": 0.44,
"detail": "iterations=100"
},
{
"case_id": "preprocess_react",
"passed": true,
"expected": "<= 50.0ms/call",
"actual": "0.004ms/call",
"duration_ms": 0.38,
"detail": "iterations=100"
},
{
"case_id": "preprocess_skill_prefix",
"passed": true,
"expected": "<= 50.0ms/call",
"actual": "0.005ms/call",
"duration_ms": 0.51,
"detail": "iterations=100"
},
{
"case_id": "tool_search_query",
"passed": true,
"expected": "<= 10.0ms/call",
"actual": "0.008ms/call",
"duration_ms": 1.69,
"detail": "iterations=200"
},
{
"case_id": "tool_search_empty",
"passed": true,
"expected": "<= 5.0ms/call",
"actual": "0.000ms/call",
"duration_ms": 0.08,
"detail": "iterations=200"
}
]
},
"tool_search": {
"score": 1.0,
"total": 10,
"passed": 10,
"failed": 0,
"details": [
{
"case_id": "read_file_query",
"passed": true,
"expected": "read_file",
"actual": "read_file",
"duration_ms": 0.02,
"detail": "query='read file' top_k=5 results=2"
},
{
"case_id": "write_file_query",
"passed": true,
"expected": "write_file",
"actual": "write_file",
"duration_ms": 0.02,
"detail": "query='write file content' top_k=5 results=2"
},
{
"case_id": "web_search_query",
"passed": true,
"expected": "web_search",
"actual": "web_search",
"duration_ms": 0.02,
"detail": "query='search web information' top_k=5 results=2"
},
{
"case_id": "shell_exec_query",
"passed": true,
"expected": "shell_exec",
"actual": "shell_exec",
"duration_ms": 0.02,
"detail": "query='execute shell command' top_k=5 results=1"
},
{
"case_id": "http_request_query",
"passed": true,
"expected": "http_request",
"actual": "http_request",
"duration_ms": 0.03,
"detail": "query='send http request url' top_k=5 results=1"
},
{
"case_id": "file_tag_query",
"passed": true,
"expected": "read_file",
"actual": "read_file",
"duration_ms": 0.02,
"detail": "query='io file' top_k=5 results=2"
},
{
"case_id": "empty_query",
"passed": true,
"expected": "__none__",
"actual": "[]",
"duration_ms": 0.0,
"detail": "query='' top_k=5 results=0"
},
{
"case_id": "no_match_query",
"passed": true,
"expected": "__none__",
"actual": "[]",
"duration_ms": 0.01,
"detail": "query='zzzznonexistent' top_k=5 results=0"
},
{
"case_id": "top_k_limit",
"passed": true,
"expected": "read_file",
"actual": "read_file",
"duration_ms": 0.02,
"detail": "query='file' top_k=1 results=1"
},
{
"case_id": "multi_token_query",
"passed": true,
"expected": "web_search",
"actual": "web_search",
"duration_ms": 0.03,
"detail": "query='search query engine' top_k=5 results=1"
}
]
},
"event_model": {
"score": 1.0,
"total": 6,
"passed": 6,
"failed": 0,
"details": [
{
"case_id": "sq_submit_drain",
"passed": true,
"expected": "task_id + drained=['hello']",
"actual": "task_id=571839fb... drained=['hello']",
"duration_ms": 0.1,
"detail": ""
},
{
"case_id": "sq_cancel",
"passed": true,
"expected": "cancelled=True",
"actual": "cancelled=True",
"duration_ms": 0.04,
"detail": ""
},
{
"case_id": "sq_close_blocks",
"passed": true,
"expected": "RuntimeError on submit after close",
"actual": "raised=True closed=True",
"duration_ms": 0.02,
"detail": ""
},
{
"case_id": "eq_emit_subscribe_replay",
"passed": true,
"expected": "1 event replayed",
"actual": "1 events",
"duration_ms": 0.07,
"detail": ""
},
{
"case_id": "eq_close_sentinel",
"passed": true,
"expected": "subscriber exits on close",
"actual": "1 events, closed=True",
"duration_ms": 21.59,
"detail": ""
},
{
"case_id": "eq_subscriber_count",
"passed": true,
"expected": "0 subscribers initially",
"actual": "0 subscribers",
"duration_ms": 0.01,
"detail": ""
}
]
},
"spec_management": {
"score": 1.0,
"total": 7,
"passed": 7,
"failed": 0,
"details": [
{
"case_id": "spec_create",
"passed": true,
"expected": "file exists on disk",
"actual": "exists=True",
"duration_ms": 2.24,
"detail": ""
},
{
"case_id": "spec_get",
"passed": true,
"expected": "spec with 2 steps",
"actual": "steps=2",
"duration_ms": 0.0,
"detail": ""
},
{
"case_id": "spec_update",
"passed": true,
"expected": "goal='Updated goal'",
"actual": "goal=Updated goal",
"duration_ms": 1.75,
"detail": ""
},
{
"case_id": "spec_confirm",
"passed": true,
"expected": "status=confirmed, all steps confirmed",
"actual": "status=confirmed",
"duration_ms": 1.86,
"detail": ""
},
{
"case_id": "spec_list",
"passed": true,
"expected": "2 specs",
"actual": "2 specs",
"duration_ms": 4.92,
"detail": ""
},
{
"case_id": "spec_delete",
"passed": true,
"expected": "deleted, 1 remaining",
"actual": "deleted=True, remaining=1",
"duration_ms": 1.94,
"detail": ""
},
{
"case_id": "spec_get_missing",
"passed": true,
"expected": "None",
"actual": "None",
"duration_ms": 0.06,
"detail": ""
}
]
},
"verification": {
"score": 1.0,
"total": 5,
"passed": 5,
"failed": 0,
"details": [
{
"case_id": "verify_pass",
"passed": true,
"expected": "passed=True, attempts=1",
"actual": "passed=True, attempts=1",
"duration_ms": 11.82,
"detail": ""
},
{
"case_id": "verify_fail",
"passed": true,
"expected": "passed=False, has errors",
"actual": "passed=False, errors=1",
"duration_ms": 9.8,
"detail": ""
},
{
"case_id": "verify_retry",
"passed": true,
"expected": "attempts=3, fix_callback called 2x",
"actual": "attempts=3, callbacks=2",
"duration_ms": 33.87,
"detail": ""
},
{
"case_id": "verify_timeout",
"passed": true,
"expected": "timeout error",
"actual": "passed=False, errors=1",
"duration_ms": 506.8,
"detail": ""
},
{
"case_id": "verify_multi_command",
"passed": true,
"expected": "overall fail, output has both commands",
"actual": "passed=False",
"duration_ms": 23.12,
"detail": ""
}
]
}
},
"overall_score": 0.9804,
"summary": "50/51 tests passed (1 failed) across 7 dimensions."
}