1918 lines
54 KiB
JSON
1918 lines
54 KiB
JSON
{
|
||
"timestamp": "2026-06-17T15:47:33.591101+00:00",
|
||
"version": "0.1.0",
|
||
"mode": "mock",
|
||
"runs": 1,
|
||
"fast": false,
|
||
"overall_accuracy": 1.0,
|
||
"overall_accuracy_mean": 1.0,
|
||
"overall_accuracy_std": 0.0,
|
||
"summary": "All 71 tests passed across 8 dimensions.",
|
||
"dimensions": {
|
||
"preprocessing": {
|
||
"metrics": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0072,
|
||
"latency_p95_ms": 0.0697,
|
||
"latency_p99_ms": 0.1071,
|
||
"consistency": 1.0,
|
||
"total": 15,
|
||
"passed": 15,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.7961,
|
||
"ci_upper": 1.0
|
||
},
|
||
"by_category": {
|
||
"greeting": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0105,
|
||
"latency_p95_ms": 0.0441,
|
||
"latency_p99_ms": 0.0485,
|
||
"consistency": 1.0,
|
||
"total": 4,
|
||
"passed": 4,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5101,
|
||
"ci_upper": 1.0
|
||
},
|
||
"tool_query": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0048,
|
||
"latency_p95_ms": 0.0085,
|
||
"latency_p99_ms": 0.0089,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 5,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5655,
|
||
"ci_upper": 1.0
|
||
},
|
||
"skill_prefix": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0195,
|
||
"latency_p95_ms": 0.1068,
|
||
"latency_p99_ms": 0.1146,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
},
|
||
"complex": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0045,
|
||
"latency_p95_ms": 0.0069,
|
||
"latency_p99_ms": 0.0071,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0081,
|
||
"latency_p95_ms": 0.0423,
|
||
"latency_p99_ms": 0.0481,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 5,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5655,
|
||
"ci_upper": 1.0
|
||
},
|
||
"medium": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0065,
|
||
"latency_p95_ms": 0.0178,
|
||
"latency_p99_ms": 0.0192,
|
||
"consistency": 1.0,
|
||
"total": 7,
|
||
"passed": 7,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.6457,
|
||
"ci_upper": 1.0
|
||
},
|
||
"hard": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0072,
|
||
"latency_p95_ms": 0.1056,
|
||
"latency_p99_ms": 0.1143,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "prep-001",
|
||
"dimension": "preprocessing",
|
||
"category": "greeting",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "direct_chat",
|
||
"actual": "direct_chat",
|
||
"duration_ms": 0.0496,
|
||
"root_cause": "none",
|
||
"detail": "input='你好' method=regex_direct",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-002",
|
||
"dimension": "preprocessing",
|
||
"category": "greeting",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "direct_chat",
|
||
"actual": "direct_chat",
|
||
"duration_ms": 0.0129,
|
||
"root_cause": "none",
|
||
"detail": "input='hello' method=regex_direct",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-003",
|
||
"dimension": "preprocessing",
|
||
"category": "greeting",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "direct_chat",
|
||
"actual": "direct_chat",
|
||
"duration_ms": 0.0081,
|
||
"root_cause": "none",
|
||
"detail": "input='谢谢' method=regex_direct",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-004",
|
||
"dimension": "preprocessing",
|
||
"category": "greeting",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "direct_chat",
|
||
"actual": "direct_chat",
|
||
"duration_ms": 0.0064,
|
||
"root_cause": "none",
|
||
"detail": "input='你是谁' method=regex_direct",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-005",
|
||
"dimension": "preprocessing",
|
||
"category": "tool_query",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0065,
|
||
"root_cause": "none",
|
||
"detail": "input='搜索golang教程' method=default_react",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-006",
|
||
"dimension": "preprocessing",
|
||
"category": "tool_query",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0048,
|
||
"root_cause": "none",
|
||
"detail": "input='执行ls命令' method=default_react",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-007",
|
||
"dimension": "preprocessing",
|
||
"category": "tool_query",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0042,
|
||
"root_cause": "none",
|
||
"detail": "input='翻译hello为中文' method=default_react",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-008",
|
||
"dimension": "preprocessing",
|
||
"category": "tool_query",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.009,
|
||
"root_cause": "none",
|
||
"detail": "input='什么是机器学习' method=default_react",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-009",
|
||
"dimension": "preprocessing",
|
||
"category": "tool_query",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0043,
|
||
"root_cause": "none",
|
||
"detail": "input='帮我分析数据' method=default_react",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-010",
|
||
"dimension": "preprocessing",
|
||
"category": "skill_prefix",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "skill_react",
|
||
"actual": "skill_react",
|
||
"duration_ms": 0.0195,
|
||
"root_cause": "none",
|
||
"detail": "input='@skill:react_agent 查看ip' method=skill_prefix",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-011",
|
||
"dimension": "preprocessing",
|
||
"category": "skill_prefix",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "direct_chat",
|
||
"actual": "direct_chat",
|
||
"duration_ms": 0.0137,
|
||
"root_cause": "none",
|
||
"detail": "input='@skill:chat_only 你好' method=skill_prefix",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-012",
|
||
"dimension": "preprocessing",
|
||
"category": "skill_prefix",
|
||
"difficulty": "hard",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.1165,
|
||
"root_cause": "none",
|
||
"detail": "input='@skill:nonexistent 做点什么' method=skill_not_found_fallback",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-013",
|
||
"dimension": "preprocessing",
|
||
"category": "complex",
|
||
"difficulty": "hard",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0072,
|
||
"root_cause": "none",
|
||
"detail": "input='帮我分析这个数据并生成报告' method=default_react",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-014",
|
||
"dimension": "preprocessing",
|
||
"category": "complex",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0045,
|
||
"root_cause": "none",
|
||
"detail": "input='随便聊聊' method=default_react",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "prep-015",
|
||
"dimension": "preprocessing",
|
||
"category": "complex",
|
||
"difficulty": "hard",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0043,
|
||
"root_cause": "none",
|
||
"detail": "input='请帮我完成以下任务:1. 查询天气 2. 生成报告' method=default_react",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
},
|
||
"overfitting": {
|
||
"metrics": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0132,
|
||
"latency_p95_ms": 0.0327,
|
||
"latency_p99_ms": 0.0347,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 5,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5655,
|
||
"ci_upper": 1.0
|
||
},
|
||
"by_category": {
|
||
"ip_check": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0352,
|
||
"latency_p95_ms": 0.0352,
|
||
"latency_p99_ms": 0.0352,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"search": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0132,
|
||
"latency_p95_ms": 0.0132,
|
||
"latency_p99_ms": 0.0132,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"greeting": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0228,
|
||
"latency_p95_ms": 0.0228,
|
||
"latency_p99_ms": 0.0228,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"tool_use": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0124,
|
||
"latency_p95_ms": 0.0124,
|
||
"latency_p99_ms": 0.0124,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"complex": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0117,
|
||
"latency_p95_ms": 0.0117,
|
||
"latency_p99_ms": 0.0117,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"medium": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0132,
|
||
"latency_p95_ms": 0.033,
|
||
"latency_p99_ms": 0.0348,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
},
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0228,
|
||
"latency_p95_ms": 0.0228,
|
||
"latency_p99_ms": 0.0228,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"hard": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0117,
|
||
"latency_p95_ms": 0.0117,
|
||
"latency_p99_ms": 0.0117,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "over-001",
|
||
"dimension": "overfitting",
|
||
"category": "ip_check",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0352,
|
||
"root_cause": "none",
|
||
"detail": "paraphrases=5 modes=['react', 'react', 'react', 'react', 'react']",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "over-002",
|
||
"dimension": "overfitting",
|
||
"category": "search",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0132,
|
||
"root_cause": "none",
|
||
"detail": "paraphrases=3 modes=['react', 'react', 'react']",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "over-003",
|
||
"dimension": "overfitting",
|
||
"category": "greeting",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "direct_chat",
|
||
"actual": "direct_chat",
|
||
"duration_ms": 0.0228,
|
||
"root_cause": "none",
|
||
"detail": "paraphrases=5 modes=['direct_chat', 'direct_chat', 'direct_chat', 'direct_chat', 'direct_chat']",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "over-004",
|
||
"dimension": "overfitting",
|
||
"category": "tool_use",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0124,
|
||
"root_cause": "none",
|
||
"detail": "paraphrases=3 modes=['react', 'react', 'react']",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "over-005",
|
||
"dimension": "overfitting",
|
||
"category": "complex",
|
||
"difficulty": "hard",
|
||
"passed": true,
|
||
"expected": "react",
|
||
"actual": "react",
|
||
"duration_ms": 0.0117,
|
||
"root_cause": "none",
|
||
"detail": "paraphrases=3 modes=['react', 'react', 'react']",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
},
|
||
"efficiency": {
|
||
"metrics": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.33,
|
||
"latency_p95_ms": 0.642,
|
||
"latency_p99_ms": 0.6724,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 5,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5655,
|
||
"ci_upper": 1.0
|
||
},
|
||
"by_category": {
|
||
"preprocess_latency": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.33,
|
||
"latency_p95_ms": 0.474,
|
||
"latency_p99_ms": 0.4868,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
},
|
||
"tool_search_latency": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.36,
|
||
"latency_p95_ms": 0.648,
|
||
"latency_p99_ms": 0.6736,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.17,
|
||
"latency_p95_ms": 0.287,
|
||
"latency_p99_ms": 0.2974,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
},
|
||
"medium": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.49,
|
||
"latency_p95_ms": 0.661,
|
||
"latency_p99_ms": 0.6762,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "eff-001",
|
||
"dimension": "efficiency",
|
||
"category": "preprocess_latency",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "<=50ms",
|
||
"actual": "0.003ms",
|
||
"duration_ms": 0.3,
|
||
"root_cause": "none",
|
||
"detail": "iterations=100 avg=0.003ms threshold=50.0ms",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "eff-002",
|
||
"dimension": "efficiency",
|
||
"category": "preprocess_latency",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "<=50ms",
|
||
"actual": "0.003ms",
|
||
"duration_ms": 0.33,
|
||
"root_cause": "none",
|
||
"detail": "iterations=100 avg=0.003ms threshold=50.0ms",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "eff-003",
|
||
"dimension": "efficiency",
|
||
"category": "preprocess_latency",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "<=50ms",
|
||
"actual": "0.005ms",
|
||
"duration_ms": 0.49,
|
||
"root_cause": "none",
|
||
"detail": "iterations=100 avg=0.005ms threshold=50.0ms",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "eff-004",
|
||
"dimension": "efficiency",
|
||
"category": "tool_search_latency",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "<=10ms",
|
||
"actual": "0.007ms",
|
||
"duration_ms": 0.68,
|
||
"root_cause": "none",
|
||
"detail": "iterations=100 avg=0.007ms threshold=10.0ms",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "eff-005",
|
||
"dimension": "efficiency",
|
||
"category": "tool_search_latency",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "<=5ms",
|
||
"actual": "0.000ms",
|
||
"duration_ms": 0.04,
|
||
"root_cause": "none",
|
||
"detail": "iterations=100 avg=0.000ms threshold=5.0ms",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
},
|
||
"tool_search": {
|
||
"metrics": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.8333,
|
||
"recall": 0.8333,
|
||
"f1": 0.8333,
|
||
"latency_p50_ms": 0.0107,
|
||
"latency_p95_ms": 0.0193,
|
||
"latency_p99_ms": 0.0222,
|
||
"consistency": 1.0,
|
||
"total": 10,
|
||
"passed": 10,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.7225,
|
||
"ci_upper": 1.0
|
||
},
|
||
"by_category": {
|
||
"exact_match": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0125,
|
||
"latency_p95_ms": 0.0213,
|
||
"latency_p99_ms": 0.0226,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 5,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5655,
|
||
"ci_upper": 1.0
|
||
},
|
||
"fuzzy_match": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.01,
|
||
"latency_p95_ms": 0.0102,
|
||
"latency_p99_ms": 0.0102,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
},
|
||
"no_match": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.0039,
|
||
"latency_p95_ms": 0.0062,
|
||
"latency_p99_ms": 0.0064,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
},
|
||
"top_k": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.008,
|
||
"latency_p95_ms": 0.008,
|
||
"latency_p99_ms": 0.008,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.8333,
|
||
"recall": 0.8333,
|
||
"f1": 0.8333,
|
||
"latency_p50_ms": 0.0114,
|
||
"latency_p95_ms": 0.0205,
|
||
"latency_p99_ms": 0.0224,
|
||
"consistency": 1.0,
|
||
"total": 7,
|
||
"passed": 7,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.6457,
|
||
"ci_upper": 1.0
|
||
},
|
||
"medium": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0099,
|
||
"latency_p95_ms": 0.0102,
|
||
"latency_p99_ms": 0.0102,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "ts-001",
|
||
"dimension": "tool_search",
|
||
"category": "exact_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "read_file",
|
||
"actual": "read_file",
|
||
"duration_ms": 0.0229,
|
||
"root_cause": "none",
|
||
"detail": "query='read file' top_k=5 results=2",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-002",
|
||
"dimension": "tool_search",
|
||
"category": "exact_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "write_file",
|
||
"actual": "write_file",
|
||
"duration_ms": 0.0148,
|
||
"root_cause": "none",
|
||
"detail": "query='write file content' top_k=5 results=2",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-003",
|
||
"dimension": "tool_search",
|
||
"category": "exact_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "web_search",
|
||
"actual": "web_search",
|
||
"duration_ms": 0.0125,
|
||
"root_cause": "none",
|
||
"detail": "query='search web information' top_k=5 results=2",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-004",
|
||
"dimension": "tool_search",
|
||
"category": "exact_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "shell_exec",
|
||
"actual": "shell_exec",
|
||
"duration_ms": 0.0112,
|
||
"root_cause": "none",
|
||
"detail": "query='execute shell command' top_k=5 results=1",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-005",
|
||
"dimension": "tool_search",
|
||
"category": "exact_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "http_request",
|
||
"actual": "http_request",
|
||
"duration_ms": 0.0114,
|
||
"root_cause": "none",
|
||
"detail": "query='send http request url' top_k=5 results=1",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-006",
|
||
"dimension": "tool_search",
|
||
"category": "fuzzy_match",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "read_file",
|
||
"actual": "read_file",
|
||
"duration_ms": 0.0102,
|
||
"root_cause": "none",
|
||
"detail": "query='io file' top_k=5 results=2",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-007",
|
||
"dimension": "tool_search",
|
||
"category": "fuzzy_match",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "web_search",
|
||
"actual": "web_search",
|
||
"duration_ms": 0.0099,
|
||
"root_cause": "none",
|
||
"detail": "query='search query engine' top_k=5 results=1",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-008",
|
||
"dimension": "tool_search",
|
||
"category": "no_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "__none__",
|
||
"actual": "[]",
|
||
"duration_ms": 0.0014,
|
||
"root_cause": "none",
|
||
"detail": "query='' top_k=5 results=0",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-009",
|
||
"dimension": "tool_search",
|
||
"category": "no_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "__none__",
|
||
"actual": "[]",
|
||
"duration_ms": 0.0065,
|
||
"root_cause": "none",
|
||
"detail": "query='zzzznonexistent' top_k=5 results=0",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ts-010",
|
||
"dimension": "tool_search",
|
||
"category": "top_k",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "read_file",
|
||
"actual": "read_file",
|
||
"duration_ms": 0.008,
|
||
"root_cause": "none",
|
||
"detail": "query='file' top_k=1 results=1",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
},
|
||
"event_model": {
|
||
"metrics": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.0524,
|
||
"latency_p95_ms": 15.8743,
|
||
"latency_p99_ms": 20.0787,
|
||
"consistency": 1.0,
|
||
"total": 6,
|
||
"passed": 6,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.6097,
|
||
"ci_upper": 1.0
|
||
},
|
||
"by_category": {
|
||
"sq_lifecycle": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.0436,
|
||
"latency_p95_ms": 0.1013,
|
||
"latency_p99_ms": 0.1064,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
},
|
||
"eq_lifecycle": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.0613,
|
||
"latency_p95_ms": 19.0229,
|
||
"latency_p99_ms": 20.7084,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.0524,
|
||
"latency_p95_ms": 15.8743,
|
||
"latency_p99_ms": 20.0787,
|
||
"consistency": 1.0,
|
||
"total": 6,
|
||
"passed": 6,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.6097,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "ev-001",
|
||
"dimension": "event_model",
|
||
"category": "sq_lifecycle",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "drained=['hello']",
|
||
"duration_ms": 0.1077,
|
||
"root_cause": "none",
|
||
"detail": "task_id=0fd87910...",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ev-002",
|
||
"dimension": "event_model",
|
||
"category": "sq_lifecycle",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "cancelled=True",
|
||
"duration_ms": 0.0436,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ev-003",
|
||
"dimension": "event_model",
|
||
"category": "sq_lifecycle",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "raised=True closed=True",
|
||
"duration_ms": 0.0097,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ev-004",
|
||
"dimension": "event_model",
|
||
"category": "eq_lifecycle",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "received=1",
|
||
"duration_ms": 0.0613,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ev-005",
|
||
"dimension": "event_model",
|
||
"category": "eq_lifecycle",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "events=1 closed=True",
|
||
"duration_ms": 21.1298,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "ev-006",
|
||
"dimension": "event_model",
|
||
"category": "eq_lifecycle",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "subscribers=0",
|
||
"duration_ms": 0.0079,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
},
|
||
"spec_management": {
|
||
"metrics": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 1.9377,
|
||
"latency_p95_ms": 2.9432,
|
||
"latency_p99_ms": 3.2494,
|
||
"consistency": 1.0,
|
||
"total": 7,
|
||
"passed": 7,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.6457,
|
||
"ci_upper": 1.0
|
||
},
|
||
"by_category": {
|
||
"crud": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 2.0343,
|
||
"latency_p95_ms": 3.0707,
|
||
"latency_p99_ms": 3.2749,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 5,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5655,
|
||
"ci_upper": 1.0
|
||
},
|
||
"edge": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.9924,
|
||
"latency_p95_ms": 1.8432,
|
||
"latency_p99_ms": 1.9188,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 1.7803,
|
||
"latency_p95_ms": 3.0069,
|
||
"latency_p99_ms": 3.2621,
|
||
"consistency": 1.0,
|
||
"total": 6,
|
||
"passed": 6,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.6097,
|
||
"ci_upper": 1.0
|
||
},
|
||
"medium": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 1.9377,
|
||
"latency_p95_ms": 1.9377,
|
||
"latency_p99_ms": 1.9377,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "sm-001",
|
||
"dimension": "spec_management",
|
||
"category": "crud",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "exists=True",
|
||
"duration_ms": 2.0343,
|
||
"root_cause": "none",
|
||
"detail": "path=/var/folders/6b/ljk5bdq50yxcsth24frf05200000gn/T/agentkit-benchmark-idcioepn/run-0/specs/sm-001/test-spec.yaml",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "sm-002",
|
||
"dimension": "spec_management",
|
||
"category": "crud",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "steps=2",
|
||
"duration_ms": 2.0501,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "sm-003",
|
||
"dimension": "spec_management",
|
||
"category": "crud",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "goal=Updated goal",
|
||
"duration_ms": 1.5264,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "sm-004",
|
||
"dimension": "spec_management",
|
||
"category": "crud",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "deleted=True remaining=0",
|
||
"duration_ms": 1.3234,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "sm-005",
|
||
"dimension": "spec_management",
|
||
"category": "crud",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "count=2",
|
||
"duration_ms": 3.3259,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "sm-006",
|
||
"dimension": "spec_management",
|
||
"category": "edge",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "status=confirmed",
|
||
"duration_ms": 1.9377,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "sm-007",
|
||
"dimension": "spec_management",
|
||
"category": "edge",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "result=None",
|
||
"duration_ms": 0.0472,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
},
|
||
"verification": {
|
||
"metrics": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 22.2216,
|
||
"latency_p95_ms": 47.7927,
|
||
"latency_p99_ms": 50.9297,
|
||
"consistency": 1.0,
|
||
"total": 5,
|
||
"passed": 5,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.5655,
|
||
"ci_upper": 1.0
|
||
},
|
||
"by_category": {
|
||
"basic": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 16.9399,
|
||
"latency_p95_ms": 18.6778,
|
||
"latency_p99_ms": 18.8323,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
},
|
||
"retry": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 51.714,
|
||
"latency_p95_ms": 51.714,
|
||
"latency_p99_ms": 51.714,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"timeout": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 0.0,
|
||
"latency_p95_ms": 0.0,
|
||
"latency_p99_ms": 0.0,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
},
|
||
"multi": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 25.5723,
|
||
"latency_p95_ms": 25.5723,
|
||
"latency_p99_ms": 25.5723,
|
||
"consistency": 1.0,
|
||
"total": 1,
|
||
"passed": 1,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.2065,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 16.9399,
|
||
"latency_p95_ms": 18.6778,
|
||
"latency_p99_ms": 18.8323,
|
||
"consistency": 1.0,
|
||
"total": 2,
|
||
"passed": 2,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.3424,
|
||
"ci_upper": 1.0
|
||
},
|
||
"medium": {
|
||
"accuracy": 1.0,
|
||
"precision": 0.0,
|
||
"recall": 0.0,
|
||
"f1": 0.0,
|
||
"latency_p50_ms": 38.6431,
|
||
"latency_p95_ms": 50.4069,
|
||
"latency_p99_ms": 51.4526,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "vf-001",
|
||
"dimension": "verification",
|
||
"category": "basic",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "passed=True attempts=1",
|
||
"duration_ms": 18.8709,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "vf-002",
|
||
"dimension": "verification",
|
||
"category": "basic",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "passed=False errors=1",
|
||
"duration_ms": 15.0089,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "vf-003",
|
||
"dimension": "verification",
|
||
"category": "retry",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "attempts=3 callbacks=2",
|
||
"duration_ms": 51.714,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "vf-004",
|
||
"dimension": "verification",
|
||
"category": "timeout",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "passed=False errors=1",
|
||
"duration_ms": 509.6538,
|
||
"root_cause": "none",
|
||
"detail": "timeout errors=['Command timed out after 0.5s: sleep 10']",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "vf-005",
|
||
"dimension": "verification",
|
||
"category": "multi",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "passed",
|
||
"actual": "passed=False",
|
||
"duration_ms": 25.5723,
|
||
"root_cause": "none",
|
||
"detail": "",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
},
|
||
"board_meeting": {
|
||
"metrics": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0107,
|
||
"latency_p95_ms": 0.3934,
|
||
"latency_p99_ms": 1.1873,
|
||
"consistency": 1.0,
|
||
"total": 18,
|
||
"passed": 18,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.8241,
|
||
"ci_upper": 1.0
|
||
},
|
||
"by_category": {
|
||
"default_template": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0141,
|
||
"latency_p95_ms": 0.031,
|
||
"latency_p99_ms": 0.0325,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
},
|
||
"explicit_experts": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0138,
|
||
"latency_p95_ms": 0.0178,
|
||
"latency_p99_ms": 0.0181,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
},
|
||
"topic_extraction": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.005,
|
||
"latency_p95_ms": 0.0073,
|
||
"latency_p99_ms": 0.0075,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
},
|
||
"no_match": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0032,
|
||
"latency_p95_ms": 0.0032,
|
||
"latency_p99_ms": 0.0032,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
},
|
||
"name_validation": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0168,
|
||
"latency_p95_ms": 0.1981,
|
||
"latency_p99_ms": 0.2143,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
},
|
||
"stop_command": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0102,
|
||
"latency_p95_ms": 1.2482,
|
||
"latency_p99_ms": 1.3583,
|
||
"consistency": 1.0,
|
||
"total": 3,
|
||
"passed": 3,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.4385,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"by_difficulty": {
|
||
"easy": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.005,
|
||
"latency_p95_ms": 0.7093,
|
||
"latency_p99_ms": 1.2505,
|
||
"consistency": 1.0,
|
||
"total": 11,
|
||
"passed": 11,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.7412,
|
||
"ci_upper": 1.0
|
||
},
|
||
"medium": {
|
||
"accuracy": 1.0,
|
||
"precision": 1.0,
|
||
"recall": 1.0,
|
||
"f1": 1.0,
|
||
"latency_p50_ms": 0.0138,
|
||
"latency_p95_ms": 0.1583,
|
||
"latency_p99_ms": 0.2063,
|
||
"consistency": 1.0,
|
||
"total": 7,
|
||
"passed": 7,
|
||
"failed": 0,
|
||
"accuracy_mean": 1.0,
|
||
"accuracy_std": 0.0,
|
||
"ci_lower": 0.6457,
|
||
"ci_upper": 1.0
|
||
}
|
||
},
|
||
"cases": [
|
||
{
|
||
"task_id": "bd-001",
|
||
"dimension": "board_meeting",
|
||
"category": "default_template",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "board",
|
||
"actual": "board",
|
||
"duration_ms": 0.0329,
|
||
"root_cause": "none",
|
||
"detail": "matched=True board_mode=True use_default=True topic='讨论是否应该进入东南亚市场'",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-002",
|
||
"dimension": "board_meeting",
|
||
"category": "default_template",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "board",
|
||
"actual": "board",
|
||
"duration_ms": 0.0141,
|
||
"root_cause": "none",
|
||
"detail": "matched=True board_mode=True use_default=True topic='AI产品定价策略应该怎么做'",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-003",
|
||
"dimension": "board_meeting",
|
||
"category": "default_template",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "board",
|
||
"actual": "board",
|
||
"duration_ms": 0.0113,
|
||
"root_cause": "none",
|
||
"detail": "matched=True board_mode=True use_default=True topic='讨论创业公司融资节奏'",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-004",
|
||
"dimension": "board_meeting",
|
||
"category": "explicit_experts",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "board",
|
||
"actual": "board",
|
||
"duration_ms": 0.0182,
|
||
"root_cause": "none",
|
||
"detail": "matched=True experts=['elon_musk', 'jeff_bezos'] use_default=False",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-005",
|
||
"dimension": "board_meeting",
|
||
"category": "explicit_experts",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "board",
|
||
"actual": "board",
|
||
"duration_ms": 0.0112,
|
||
"root_cause": "none",
|
||
"detail": "matched=True experts=['charlie_munger', 'warren_buffett'] use_default=False",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-006",
|
||
"dimension": "board_meeting",
|
||
"category": "explicit_experts",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "board",
|
||
"actual": "board",
|
||
"duration_ms": 0.0138,
|
||
"root_cause": "none",
|
||
"detail": "matched=True experts=['elon_musk', 'jeff_bezos', 'allenzhang'] use_default=False",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-007",
|
||
"dimension": "board_meeting",
|
||
"category": "topic_extraction",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "讨论是否应该进入东南亚市场",
|
||
"actual": "讨论是否应该进入东南亚市场",
|
||
"duration_ms": 0.005,
|
||
"root_cause": "none",
|
||
"detail": "input='@board 讨论是否应该进入东南亚市场' topic='讨论是否应该进入东南亚市场' matched=True",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-008",
|
||
"dimension": "board_meeting",
|
||
"category": "topic_extraction",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "火星商业化方案",
|
||
"actual": "火星商业化方案",
|
||
"duration_ms": 0.0076,
|
||
"root_cause": "none",
|
||
"detail": "input='@board:elon_musk,jeff_bezos 火星商业化方案' topic='火星商业化方案' matched=True",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-009",
|
||
"dimension": "board_meeting",
|
||
"category": "topic_extraction",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "",
|
||
"actual": "",
|
||
"duration_ms": 0.0049,
|
||
"root_cause": "none",
|
||
"detail": "input='@board' topic='' matched=True",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-010",
|
||
"dimension": "board_meeting",
|
||
"category": "no_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "not_board",
|
||
"actual": "not_board",
|
||
"duration_ms": 0.0032,
|
||
"root_cause": "none",
|
||
"detail": "input='讨论一下市场策略' matched=False board_mode=False",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-011",
|
||
"dimension": "board_meeting",
|
||
"category": "no_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "not_board",
|
||
"actual": "not_board",
|
||
"duration_ms": 0.0032,
|
||
"root_cause": "none",
|
||
"detail": "input='@team:analyst,writer 协作完成任务' matched=False board_mode=False",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-012",
|
||
"dimension": "board_meeting",
|
||
"category": "no_match",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "not_board",
|
||
"actual": "not_board",
|
||
"duration_ms": 0.0031,
|
||
"root_cause": "none",
|
||
"detail": "input='@skill:react_agent 查看ip' matched=False board_mode=False",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-013",
|
||
"dimension": "board_meeting",
|
||
"category": "name_validation",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "2_valid",
|
||
"actual": "2_valid",
|
||
"duration_ms": 0.0103,
|
||
"root_cause": "none",
|
||
"detail": "input='@board:elon_musk,jeff_bezos 主题' experts=['elon_musk', 'jeff_bezos'] max=10",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-014",
|
||
"dimension": "board_meeting",
|
||
"category": "name_validation",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "default_fallback",
|
||
"actual": "default_fallback",
|
||
"duration_ms": 0.2183,
|
||
"root_cause": "none",
|
||
"detail": "input='@board:@#$ 主题' experts=['elon_musk', 'jeff_bezos', 'allenzhang', 'charlie_munger', 'paul_graham'] max=10",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-015",
|
||
"dimension": "board_meeting",
|
||
"category": "name_validation",
|
||
"difficulty": "medium",
|
||
"passed": true,
|
||
"expected": "10_capped",
|
||
"actual": "10_capped",
|
||
"duration_ms": 0.0168,
|
||
"root_cause": "none",
|
||
"detail": "input='@board:a,b,c,d,e,f,g,h,i,j,k 主题' experts=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] max=10",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-016",
|
||
"dimension": "board_meeting",
|
||
"category": "stop_command",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "is_stop",
|
||
"actual": "is_stop",
|
||
"duration_ms": 1.3858,
|
||
"root_cause": "none",
|
||
"detail": "input='/stop' stop_commands=frozenset({'结束讨论', '停止讨论', 'stop', '/stop'})",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-017",
|
||
"dimension": "board_meeting",
|
||
"category": "stop_command",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "is_stop",
|
||
"actual": "is_stop",
|
||
"duration_ms": 0.0102,
|
||
"root_cause": "none",
|
||
"detail": "input='停止讨论' stop_commands=frozenset({'结束讨论', '停止讨论', 'stop', '/stop'})",
|
||
"consistency": 1.0
|
||
},
|
||
{
|
||
"task_id": "bd-018",
|
||
"dimension": "board_meeting",
|
||
"category": "stop_command",
|
||
"difficulty": "easy",
|
||
"passed": true,
|
||
"expected": "not_stop",
|
||
"actual": "not_stop",
|
||
"duration_ms": 0.0022,
|
||
"root_cause": "none",
|
||
"detail": "input='继续讨论' stop_commands=frozenset({'结束讨论', '停止讨论', 'stop', '/stop'})",
|
||
"consistency": 1.0
|
||
}
|
||
]
|
||
}
|
||
}
|
||
} |