2017 lines
55 KiB
JSON
2017 lines
55 KiB
JSON
{
|
||
"timestamp": "2026-06-17T04:52:53.863927+00:00",
|
||
"version": "0.1.0",
|
||
"mode": "all",
|
||
"runs": 1,
|
||
"fast": false,
|
||
"overall_accuracy": 0.9524,
|
||
"overall_accuracy_mean": 0.9524,
|
||
"overall_accuracy_std": 0.0,
|
||
"summary": "60/63 tests passed (3 failed) across 9 dimensions.",
|
||
"dimensions": {
|
||
"preprocessing": {
|
||
"metrics": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0128,
|
||
"latency_p95_ms": 0.057,
|
||
"latency_p99_ms": 0.1086,
|
||
"consistency": 1.0,
|
||
"total": 15,
|
||
"passed": 15,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.7961,
|
||
"ci_upper": 1.0
|
||
},
|
||
"by_category": {
|
||
"greeting": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0133,
|
||
"latency_p95_ms": 0.026,
|
||
"latency_p99_ms": 0.0275,
|
||
"consistency": 1.0,
|
||
"total": 4,
|
||
"passed": 4,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5101,
|
||
"ci_upper": 1.0
|
||
},
|
||
"tool_query": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0115,
|
||
"latency_p95_ms": 0.0166,
|
||
"latency_p99_ms": 0.0172,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 5,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5655,
|
||
"ci_upper": 1.0
|
||
},
|
||
"skill_prefix": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0294,
|
||
"latency_p95_ms": 0.1123,
|
||
"latency_p99_ms": 0.1197,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
},
|
||
"complex": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0101,
|
||
"latency_p95_ms": 0.0125,
|
||
"latency_p99_ms": 0.0127,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0115,
|
||
"latency_p95_ms": 0.0253,
|
||
"latency_p99_ms": 0.0274,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 5,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5655,
|
||
"ci_upper": 1.0
|
||
},
|
||
"medium": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0136,
|
||
"latency_p95_ms": 0.0263,
|
||
"latency_p99_ms": 0.0288,
|
||
"consistency": 1.0,
|
||
"total": 7,
|
||
"passed": 7,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.6457,
|
||
"ci_upper": 1.0
|
||
},
|
||
"hard": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0128,
|
||
"latency_p95_ms": 0.1106,
|
||
"latency_p99_ms": 0.1193,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "prep-001",
|
||
"dimension": "preprocessing",
|
||
"category": "greeting",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "direct_chat",
|
||
"actual": "direct_chat",
|
||
"duration_ms": 0.0279,
|
||
"root_cause": "none",
|
||
"detail": "input='你好' method=regex_direct",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-002",
|
||
"dimension": "preprocessing",
|
||
"category": "greeting",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "direct_chat",
|
||
"actual": "direct_chat",
|
||
"duration_ms": 0.0151,
|
||
"root_cause": "none",
|
||
"detail": "input='hello' method=regex_direct",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-003",
|
||
"dimension": "preprocessing",
|
||
"category": "greeting",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "direct_chat",
|
||
"actual": "direct_chat",
|
||
"duration_ms": 0.0111,
|
||
"root_cause": "none",
|
||
"detail": "input='谢谢' method=regex_direct",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-004",
|
||
"dimension": "preprocessing",
|
||
"category": "greeting",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "direct_chat",
|
||
"actual": "direct_chat",
|
||
"duration_ms": 0.0115,
|
||
"root_cause": "none",
|
||
"detail": "input='你是谁' method=regex_direct",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-005",
|
||
"dimension": "preprocessing",
|
||
"category": "tool_query",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0136,
|
||
"root_cause": "none",
|
||
"detail": "input='搜索golang教程' method=default_react",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-006",
|
||
"dimension": "preprocessing",
|
||
"category": "tool_query",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0115,
|
||
"root_cause": "none",
|
||
"detail": "input='执行ls命令' method=default_react",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-007",
|
||
"dimension": "preprocessing",
|
||
"category": "tool_query",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0174,
|
||
"root_cause": "none",
|
||
"detail": "input='翻译hello为中文' method=default_react",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-008",
|
||
"dimension": "preprocessing",
|
||
"category": "tool_query",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0113,
|
||
"root_cause": "none",
|
||
"detail": "input='什么是机器学习' method=default_react",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-009",
|
||
"dimension": "preprocessing",
|
||
"category": "tool_query",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0109,
|
||
"root_cause": "none",
|
||
"detail": "input='帮我分析数据' method=default_react",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-010",
|
||
"dimension": "preprocessing",
|
||
"category": "skill_prefix",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "skill_react",
|
||
"actual": "skill_react",
|
||
"duration_ms": 0.0294,
|
||
"root_cause": "none",
|
||
"detail": "input='@skill:react_agent 查看ip' method=skill_prefix",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-011",
|
||
"dimension": "preprocessing",
|
||
"category": "skill_prefix",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "direct_chat",
|
||
"actual": "direct_chat",
|
||
"duration_ms": 0.0191,
|
||
"root_cause": "none",
|
||
"detail": "input='@skill:chat_only 你好' method=skill_prefix",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-012",
|
||
"dimension": "preprocessing",
|
||
"category": "skill_prefix",
|
||
"difficulty": "hard",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.1215,
|
||
"root_cause": "none",
|
||
"detail": "input='@skill:nonexistent 做点什么' method=skill_not_found_fallback",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-013",
|
||
"dimension": "preprocessing",
|
||
"category": "complex",
|
||
"difficulty": "hard",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0101,
|
||
"root_cause": "none",
|
||
"detail": "input='帮我分析这个数据并生成报告' method=default_react",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-014",
|
||
"dimension": "preprocessing",
|
||
"category": "complex",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0099,
|
||
"root_cause": "none",
|
||
"detail": "input='随便聊聊' method=default_react",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-015",
|
||
"dimension": "preprocessing",
|
||
"category": "complex",
|
||
"difficulty": "hard",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0128,
|
||
"root_cause": "none",
|
||
"detail": "input='请帮我完成以下任务:1. 查询天气 2. 生成报告' method=default_react",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
},
|
||
"overfitting": {
|
||
"metrics": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.025,
|
||
"latency_p95_ms": 0.0557,
|
||
"latency_p99_ms": 0.0596,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 5,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5655,
|
||
"ci_upper": 1.0
|
||
},
|
||
"by_category": {
|
||
"ip_check": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0362,
|
||
"latency_p95_ms": 0.0362,
|
||
"latency_p99_ms": 0.0362,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"search": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0243,
|
||
"latency_p95_ms": 0.0243,
|
||
"latency_p99_ms": 0.0243,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"greeting": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0606,
|
||
"latency_p95_ms": 0.0606,
|
||
"latency_p99_ms": 0.0606,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"tool_use": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0233,
|
||
"latency_p95_ms": 0.0233,
|
||
"latency_p99_ms": 0.0233,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"complex": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.025,
|
||
"latency_p95_ms": 0.025,
|
||
"latency_p99_ms": 0.025,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"medium": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0243,
|
||
"latency_p95_ms": 0.035,
|
||
"latency_p99_ms": 0.036,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
},
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0606,
|
||
"latency_p95_ms": 0.0606,
|
||
"latency_p99_ms": 0.0606,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"hard": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.025,
|
||
"latency_p95_ms": 0.025,
|
||
"latency_p99_ms": 0.025,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "over-001",
|
||
"dimension": "overfitting",
|
||
"category": "ip_check",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0362,
|
||
"root_cause": "none",
|
||
"detail": "paraphrases=5 modes=['react', 'react', 'react', 'react', 'react']",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "over-002",
|
||
"dimension": "overfitting",
|
||
"category": "search",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0243,
|
||
"root_cause": "none",
|
||
"detail": "paraphrases=3 modes=['react', 'react', 'react']",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "over-003",
|
||
"dimension": "overfitting",
|
||
"category": "greeting",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "direct_chat",
|
||
"actual": "direct_chat",
|
||
"duration_ms": 0.0606,
|
||
"root_cause": "none",
|
||
"detail": "paraphrases=5 modes=['direct_chat', 'direct_chat', 'direct_chat', 'direct_chat', 'direct_chat']",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "over-004",
|
||
"dimension": "overfitting",
|
||
"category": "tool_use",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0233,
|
||
"root_cause": "none",
|
||
"detail": "paraphrases=3 modes=['react', 'react', 'react']",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "over-005",
|
||
"dimension": "overfitting",
|
||
"category": "complex",
|
||
"difficulty": "hard",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.025,
|
||
"root_cause": "none",
|
||
"detail": "paraphrases=3 modes=['react', 'react', 'react']",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
},
|
||
"efficiency": {
|
||
"metrics": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.33,
|
||
"latency_p95_ms": 0.622,
|
||
"latency_p99_ms": 0.6604,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 5,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5655,
|
||
"ci_upper": 1.0
|
||
},
|
||
"by_category": {
|
||
"preprocess_latency": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.33,
|
||
"latency_p95_ms": 0.42,
|
||
"latency_p99_ms": 0.428,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
},
|
||
"tool_search_latency": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.355,
|
||
"latency_p95_ms": 0.6385,
|
||
"latency_p99_ms": 0.6637,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.165,
|
||
"latency_p95_ms": 0.2775,
|
||
"latency_p99_ms": 0.2875,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
},
|
||
"medium": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.43,
|
||
"latency_p95_ms": 0.646,
|
||
"latency_p99_ms": 0.6652,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "eff-001",
|
||
"dimension": "efficiency",
|
||
"category": "preprocess_latency",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "<=50ms",
|
||
"actual": "0.003ms",
|
||
"duration_ms": 0.29,
|
||
"root_cause": "none",
|
||
"detail": "iterations=100 avg=0.003ms threshold=50.0ms",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "eff-002",
|
||
"dimension": "efficiency",
|
||
"category": "preprocess_latency",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "<=50ms",
|
||
"actual": "0.003ms",
|
||
"duration_ms": 0.33,
|
||
"root_cause": "none",
|
||
"detail": "iterations=100 avg=0.003ms threshold=50.0ms",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "eff-003",
|
||
"dimension": "efficiency",
|
||
"category": "preprocess_latency",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "<=50ms",
|
||
"actual": "0.004ms",
|
||
"duration_ms": 0.43,
|
||
"root_cause": "none",
|
||
"detail": "iterations=100 avg=0.004ms threshold=50.0ms",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "eff-004",
|
||
"dimension": "efficiency",
|
||
"category": "tool_search_latency",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "<=10ms",
|
||
"actual": "0.007ms",
|
||
"duration_ms": 0.67,
|
||
"root_cause": "none",
|
||
"detail": "iterations=100 avg=0.007ms threshold=10.0ms",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "eff-005",
|
||
"dimension": "efficiency",
|
||
"category": "tool_search_latency",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "<=5ms",
|
||
"actual": "0.000ms",
|
||
"duration_ms": 0.04,
|
||
"root_cause": "none",
|
||
"detail": "iterations=100 avg=0.000ms threshold=5.0ms",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
},
|
||
"tool_search": {
|
||
"metrics": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.8333,
|
||
"recall": 0.8333,
|
||
"f1": 0.8333,
|
||
"latency_p50_ms": 0.0192,
|
||
"latency_p95_ms": 0.0278,
|
||
"latency_p99_ms": 0.0326,
|
||
"consistency": 1.0,
|
||
"total": 10,
|
||
"passed": 10,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.7225,
|
||
"ci_upper": 1.0
|
||
},
|
||
"by_category": {
|
||
"exact_match": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0199,
|
||
"latency_p95_ms": 0.0203,
|
||
"latency_p99_ms": 0.0204,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 5,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5655,
|
||
"ci_upper": 1.0
|
||
},
|
||
"fuzzy_match": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0264,
|
||
"latency_p95_ms": 0.0331,
|
||
"latency_p99_ms": 0.0337,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
},
|
||
"no_match": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.0118,
|
||
"latency_p95_ms": 0.0122,
|
||
"latency_p99_ms": 0.0123,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
},
|
||
"top_k": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.016,
|
||
"latency_p95_ms": 0.016,
|
||
"latency_p99_ms": 0.016,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.8333,
|
||
"recall": 0.8333,
|
||
"f1": 0.8333,
|
||
"latency_p50_ms": 0.0194,
|
||
"latency_p95_ms": 0.0203,
|
||
"latency_p99_ms": 0.0204,
|
||
"consistency": 1.0,
|
||
"total": 7,
|
||
"passed": 7,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.6457,
|
||
"ci_upper": 1.0
|
||
},
|
||
"medium": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.019,
|
||
"latency_p95_ms": 0.0323,
|
||
"latency_p99_ms": 0.0335,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "ts-001",
|
||
"dimension": "tool_search",
|
||
"category": "exact_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "read_file",
|
||
"actual": "read_file",
|
||
"duration_ms": 0.0199,
|
||
"root_cause": "none",
|
||
"detail": "query='read file' top_k=5 results=2",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-002",
|
||
"dimension": "tool_search",
|
||
"category": "exact_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "write_file",
|
||
"actual": "write_file",
|
||
"duration_ms": 0.0204,
|
||
"root_cause": "none",
|
||
"detail": "query='write file content' top_k=5 results=2",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-003",
|
||
"dimension": "tool_search",
|
||
"category": "exact_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "web_search",
|
||
"actual": "web_search",
|
||
"duration_ms": 0.02,
|
||
"root_cause": "none",
|
||
"detail": "query='search web information' top_k=5 results=2",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-004",
|
||
"dimension": "tool_search",
|
||
"category": "exact_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "shell_exec",
|
||
"actual": "shell_exec",
|
||
"duration_ms": 0.018,
|
||
"root_cause": "none",
|
||
"detail": "query='execute shell command' top_k=5 results=1",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-005",
|
||
"dimension": "tool_search",
|
||
"category": "exact_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "http_request",
|
||
"actual": "http_request",
|
||
"duration_ms": 0.0194,
|
||
"root_cause": "none",
|
||
"detail": "query='send http request url' top_k=5 results=1",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-006",
|
||
"dimension": "tool_search",
|
||
"category": "fuzzy_match",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "read_file",
|
||
"actual": "read_file",
|
||
"duration_ms": 0.0338,
|
||
"root_cause": "none",
|
||
"detail": "query='io file' top_k=5 results=2",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-007",
|
||
"dimension": "tool_search",
|
||
"category": "fuzzy_match",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "web_search",
|
||
"actual": "web_search",
|
||
"duration_ms": 0.019,
|
||
"root_cause": "none",
|
||
"detail": "query='search query engine' top_k=5 results=1",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-008",
|
||
"dimension": "tool_search",
|
||
"category": "no_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "__none__",
|
||
"actual": "[]",
|
||
"duration_ms": 0.0112,
|
||
"root_cause": "none",
|
||
"detail": "query='' top_k=5 results=0",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-009",
|
||
"dimension": "tool_search",
|
||
"category": "no_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "__none__",
|
||
"actual": "[]",
|
||
"duration_ms": 0.0123,
|
||
"root_cause": "none",
|
||
"detail": "query='zzzznonexistent' top_k=5 results=0",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-010",
|
||
"dimension": "tool_search",
|
||
"category": "top_k",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "read_file",
|
||
"actual": "read_file",
|
||
"duration_ms": 0.016,
|
||
"root_cause": "none",
|
||
"detail": "query='file' top_k=1 results=1",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
},
|
||
"event_model": {
|
||
"metrics": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.057,
|
||
"latency_p95_ms": 15.9984,
|
||
"latency_p99_ms": 20.2369,
|
||
"consistency": 1.0,
|
||
"total": 6,
|
||
"passed": 6,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.6097,
|
||
"ci_upper": 1.0
|
||
},
|
||
"by_category": {
|
||
"sq_lifecycle": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.046,
|
||
"latency_p95_ms": 0.0982,
|
||
"latency_p99_ms": 0.1028,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
},
|
||
"eq_lifecycle": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.0681,
|
||
"latency_p95_ms": 19.1737,
|
||
"latency_p99_ms": 20.8719,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.057,
|
||
"latency_p95_ms": 15.9984,
|
||
"latency_p99_ms": 20.2369,
|
||
"consistency": 1.0,
|
||
"total": 6,
|
||
"passed": 6,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.6097,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "ev-001",
|
||
"dimension": "event_model",
|
||
"category": "sq_lifecycle",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "drained=['hello']",
|
||
"duration_ms": 0.104,
|
||
"root_cause": "none",
|
||
"detail": "task_id=09dccea9...",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ev-002",
|
||
"dimension": "event_model",
|
||
"category": "sq_lifecycle",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "cancelled=True",
|
||
"duration_ms": 0.046,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ev-003",
|
||
"dimension": "event_model",
|
||
"category": "sq_lifecycle",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "raised=True closed=True",
|
||
"duration_ms": 0.0115,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ev-004",
|
||
"dimension": "event_model",
|
||
"category": "eq_lifecycle",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "received=1",
|
||
"duration_ms": 0.0681,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ev-005",
|
||
"dimension": "event_model",
|
||
"category": "eq_lifecycle",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "events=1 closed=True",
|
||
"duration_ms": 21.2965,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ev-006",
|
||
"dimension": "event_model",
|
||
"category": "eq_lifecycle",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "subscribers=0",
|
||
"duration_ms": 0.007,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
},
|
||
"spec_management": {
|
||
"metrics": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 1.3834,
|
||
"latency_p95_ms": 3.4578,
|
||
"latency_p99_ms": 4.0077,
|
||
"consistency": 1.0,
|
||
"total": 7,
|
||
"passed": 7,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.6457,
|
||
"ci_upper": 1.0
|
||
},
|
||
"by_category": {
|
||
"crud": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 1.3834,
|
||
"latency_p95_ms": 3.6044,
|
||
"latency_p99_ms": 4.037,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 5,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5655,
|
||
"ci_upper": 1.0
|
||
},
|
||
"edge": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.9497,
|
||
"latency_p95_ms": 1.7635,
|
||
"latency_p99_ms": 1.8358,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 1.3659,
|
||
"latency_p95_ms": 3.4693,
|
||
"latency_p99_ms": 4.01,
|
||
"consistency": 1.0,
|
||
"total": 6,
|
||
"passed": 6,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.6097,
|
||
"ci_upper": 1.0
|
||
},
|
||
"medium": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 1.8539,
|
||
"latency_p95_ms": 1.8539,
|
||
"latency_p99_ms": 1.8539,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "sm-001",
|
||
"dimension": "spec_management",
|
||
"category": "crud",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "exists=True",
|
||
"duration_ms": 1.3484,
|
||
"root_cause": "none",
|
||
"detail": "path=/var/folders/6b/ljk5bdq50yxcsth24frf05200000gn/T/agentkit-benchmark-wll_nqgl/run-0/specs/sm-001/test-spec.yaml",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "sm-002",
|
||
"dimension": "spec_management",
|
||
"category": "crud",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "steps=2",
|
||
"duration_ms": 1.3834,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "sm-003",
|
||
"dimension": "spec_management",
|
||
"category": "crud",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "goal=Updated goal",
|
||
"duration_ms": 1.4414,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "sm-004",
|
||
"dimension": "spec_management",
|
||
"category": "crud",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "deleted=True remaining=0",
|
||
"duration_ms": 1.0766,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "sm-005",
|
||
"dimension": "spec_management",
|
||
"category": "crud",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "count=2",
|
||
"duration_ms": 4.1452,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "sm-006",
|
||
"dimension": "spec_management",
|
||
"category": "edge",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "status=confirmed",
|
||
"duration_ms": 1.8539,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "sm-007",
|
||
"dimension": "spec_management",
|
||
"category": "edge",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "result=None",
|
||
"duration_ms": 0.0454,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
},
|
||
"verification": {
|
||
"metrics": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 22.0041,
|
||
"latency_p95_ms": 411.5705,
|
||
"latency_p99_ms": 487.0649,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 5,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5655,
|
||
"ci_upper": 1.0
|
||
},
|
||
"by_category": {
|
||
"basic": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 11.4916,
|
||
"latency_p95_ms": 11.8303,
|
||
"latency_p99_ms": 11.8604,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
},
|
||
"retry": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 34.0985,
|
||
"latency_p95_ms": 34.0985,
|
||
"latency_p99_ms": 34.0985,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"timeout": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 505.9385,
|
||
"latency_p95_ms": 505.9385,
|
||
"latency_p99_ms": 505.9385,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"multi": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 22.0041,
|
||
"latency_p95_ms": 22.0041,
|
||
"latency_p99_ms": 22.0041,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 11.4916,
|
||
"latency_p95_ms": 11.8303,
|
||
"latency_p99_ms": 11.8604,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
},
|
||
"medium": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 34.0985,
|
||
"latency_p95_ms": 458.7545,
|
||
"latency_p99_ms": 496.5017,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "vf-001",
|
||
"dimension": "verification",
|
||
"category": "basic",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "passed=True attempts=1",
|
||
"duration_ms": 11.8679,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "vf-002",
|
||
"dimension": "verification",
|
||
"category": "basic",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "passed=False errors=1",
|
||
"duration_ms": 11.1154,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "vf-003",
|
||
"dimension": "verification",
|
||
"category": "retry",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "attempts=3 callbacks=2",
|
||
"duration_ms": 34.0985,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "vf-004",
|
||
"dimension": "verification",
|
||
"category": "timeout",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "passed=False errors=1",
|
||
"duration_ms": 505.9385,
|
||
"root_cause": "none",
|
||
"detail": "errors=['Command timed out after 0.5s: sleep 10']",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "vf-005",
|
||
"dimension": "verification",
|
||
"category": "multi",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "passed=False",
|
||
"duration_ms": 22.0041,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
},
|
||
"llm_reasoning": {
|
||
"metrics": {
|
||
"accuracy": 0.6,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 25149.4865,
|
||
"latency_p95_ms": 30001.1677,
|
||
"latency_p99_ms": 30001.2291,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 3,
|
||
"failed": 2,
|
||
"accuracy_mean": 0.6,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2307,
|
||
"ci_upper": 0.8824
|
||
},
|
||
"by_category": {
|
||
"intent_understanding": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 21288.4177,
|
||
"latency_p95_ms": 21288.4177,
|
||
"latency_p99_ms": 21288.4177,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"tool_selection": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 5894.9682,
|
||
"latency_p95_ms": 5894.9682,
|
||
"latency_p99_ms": 5894.9682,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"multi_step": {
|
||
"accuracy": 0.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 30000.8609,
|
||
"latency_p95_ms": 30000.8609,
|
||
"latency_p99_ms": 30000.8609,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 0,
|
||
"failed": 1,
|
||
"accuracy_mean": 0.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.0,
|
||
"ci_upper": 0.7935
|
||
},
|
||
"code_generation": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 25149.4865,
|
||
"latency_p95_ms": 25149.4865,
|
||
"latency_p99_ms": 25149.4865,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"error_recovery": {
|
||
"accuracy": 0.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 30001.2444,
|
||
"latency_p95_ms": 30001.2444,
|
||
"latency_p99_ms": 30001.2444,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 0,
|
||
"failed": 1,
|
||
"accuracy_mean": 0.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.0,
|
||
"ci_upper": 0.7935
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 21288.4177,
|
||
"latency_p95_ms": 21288.4177,
|
||
"latency_p99_ms": 21288.4177,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"medium": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 15522.2273,
|
||
"latency_p95_ms": 24186.7606,
|
||
"latency_p99_ms": 24956.9413,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
},
|
||
"hard": {
|
||
"accuracy": 0.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 30001.0526,
|
||
"latency_p95_ms": 30001.2252,
|
||
"latency_p99_ms": 30001.2406,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 0,
|
||
"failed": 2,
|
||
"accuracy_mean": 0.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.0,
|
||
"ci_upper": 0.6576
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "llm-001",
|
||
"dimension": "llm_reasoning",
|
||
"category": "intent_understanding",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "mode=react tokens=1116 len=974",
|
||
"duration_ms": 21288.4177,
|
||
"root_cause": "none",
|
||
"detail": "mode=react keywords=['ip', '地址', 'ifconfig', 'hostname', '网络']",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "llm-002",
|
||
"dimension": "llm_reasoning",
|
||
"category": "tool_selection",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "mode=react tokens=205 len=87",
|
||
"duration_ms": 5894.9682,
|
||
"root_cause": "none",
|
||
"detail": "mode=react keywords=['search', '搜索', 'web', '论文', 'paper', 'agent']",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "llm-003",
|
||
"dimension": "llm_reasoning",
|
||
"category": "multi_step",
|
||
"difficulty": "hard",
|
||
"passed": false,
|
||
"expected": "react",
|
||
"actual": "timeout",
|
||
"duration_ms": 30000.8609,
|
||
"root_cause": "timeout",
|
||
"detail": "LLM call timed out after 30s",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "llm-004",
|
||
"dimension": "llm_reasoning",
|
||
"category": "code_generation",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "mode=react tokens=1359 len=1001",
|
||
"duration_ms": 25149.4865,
|
||
"root_cause": "none",
|
||
"detail": "mode=react keywords=['def', 'fib', 'return', 'python']",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "llm-005",
|
||
"dimension": "llm_reasoning",
|
||
"category": "error_recovery",
|
||
"difficulty": "hard",
|
||
"passed": false,
|
||
"expected": "react",
|
||
"actual": "timeout",
|
||
"duration_ms": 30001.2444,
|
||
"root_cause": "timeout",
|
||
"detail": "LLM call timed out after 30s",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
},
|
||
"gui_integration": {
|
||
"metrics": {
|
||
"accuracy": 0.8,
|
||
"precision": 0.8,
|
||
"recall": 0.8,
|
||
"f1": 0.8,
|
||
"latency_p50_ms": 0.0,
|
||
"latency_p95_ms": 0.0,
|
||
"latency_p99_ms": 0.0,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 4,
|
||
"failed": 1,
|
||
"accuracy_mean": 0.8,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3755,
|
||
"ci_upper": 0.9638
|
||
},
|
||
"by_category": {
|
||
"service_startup": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0,
|
||
"latency_p95_ms": 0.0,
|
||
"latency_p99_ms": 0.0,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"api_availability": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0,
|
||
"latency_p95_ms": 0.0,
|
||
"latency_p99_ms": 0.0,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
},
|
||
"websocket": {
|
||
"accuracy": 0.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.0,
|
||
"latency_p95_ms": 0.0,
|
||
"latency_p99_ms": 0.0,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 0,
|
||
"failed": 1,
|
||
"accuracy_mean": 0.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.0,
|
||
"ci_upper": 0.7935
|
||
},
|
||
"frontend": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0,
|
||
"latency_p95_ms": 0.0,
|
||
"latency_p99_ms": 0.0,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0,
|
||
"latency_p95_ms": 0.0,
|
||
"latency_p99_ms": 0.0,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
},
|
||
"medium": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0,
|
||
"latency_p95_ms": 0.0,
|
||
"latency_p99_ms": 0.0,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
},
|
||
"hard": {
|
||
"accuracy": 0.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.0,
|
||
"latency_p95_ms": 0.0,
|
||
"latency_p99_ms": 0.0,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 0,
|
||
"failed": 1,
|
||
"accuracy_mean": 0.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.0,
|
||
"ci_upper": 0.7935
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "gui-001",
|
||
"dimension": "gui_integration",
|
||
"category": "service_startup",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "started",
|
||
"actual": "started",
|
||
"duration_ms": 0.0,
|
||
"root_cause": "none",
|
||
"detail": "port=64767 pid=20993",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "gui-002",
|
||
"dimension": "gui_integration",
|
||
"category": "api_availability",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "200",
|
||
"actual": "200",
|
||
"duration_ms": 0.0,
|
||
"root_cause": "none",
|
||
"detail": "health=200 skills=200",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "gui-003",
|
||
"dimension": "gui_integration",
|
||
"category": "api_availability",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "reachable",
|
||
"actual": "reachable",
|
||
"duration_ms": 0.0,
|
||
"root_cause": "none",
|
||
"detail": "status=405",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "gui-004",
|
||
"dimension": "gui_integration",
|
||
"category": "websocket",
|
||
"difficulty": "hard",
|
||
"passed": false,
|
||
"expected": "connected",
|
||
"actual": "failed",
|
||
"duration_ms": 0.0,
|
||
"root_cause": "gui_failure",
|
||
"detail": "error: server rejected WebSocket connection: HTTP 403",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "gui-005",
|
||
"dimension": "gui_integration",
|
||
"category": "frontend",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "html",
|
||
"actual": "html",
|
||
"duration_ms": 0.0,
|
||
"root_cause": "none",
|
||
"detail": "status=200 len=465",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
}
|
||
},
|
||
"baseline_comparison": {
|
||
"status": "compared",
|
||
"dimensions": {
|
||
"preprocessing": {
|
||
"baseline_accuracy": 1.0,
|
||
"current_accuracy": 1.0,
|
||
"change": 0.0,
|
||
"direction": "—"
|
||
},
|
||
"overfitting": {
|
||
"baseline_accuracy": 1.0,
|
||
"current_accuracy": 1.0,
|
||
"change": 0.0,
|
||
"direction": "—"
|
||
},
|
||
"efficiency": {
|
||
"baseline_accuracy": 1.0,
|
||
"current_accuracy": 1.0,
|
||
"change": 0.0,
|
||
"direction": "—"
|
||
},
|
||
"tool_search": {
|
||
"baseline_accuracy": 1.0,
|
||
"current_accuracy": 1.0,
|
||
"change": 0.0,
|
||
"direction": "—"
|
||
},
|
||
"event_model": {
|
||
"baseline_accuracy": 1.0,
|
||
"current_accuracy": 1.0,
|
||
"change": 0.0,
|
||
"direction": "—"
|
||
},
|
||
"spec_management": {
|
||
"baseline_accuracy": 1.0,
|
||
"current_accuracy": 1.0,
|
||
"change": 0.0,
|
||
"direction": "—"
|
||
},
|
||
"verification": {
|
||
"baseline_accuracy": 1.0,
|
||
"current_accuracy": 1.0,
|
||
"change": 0.0,
|
||
"direction": "—"
|
||
},
|
||
"llm_reasoning": {
|
||
"baseline_accuracy": 0.0,
|
||
"current_accuracy": 0.6,
|
||
"change": 0.6,
|
||
"direction": "↑"
|
||
},
|
||
"gui_integration": {
|
||
"baseline_accuracy": 0.0,
|
||
"current_accuracy": 0.8,
|
||
"change": 0.8,
|
||
"direction": "↑"
|
||
}
|
||
}
|
||
}
|
||
} |