{ "timestamp": "2026-06-17T15:47:33.591101+00:00", "version": "0.1.0", "mode": "mock", "runs": 1, "fast": false, "overall_accuracy": 1.0, "overall_accuracy_mean": 1.0, "overall_accuracy_std": 0.0, "summary": "All 71 tests passed across 8 dimensions.", "dimensions": { "preprocessing": { "metrics": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0072, "latency_p95_ms": 0.0697, "latency_p99_ms": 0.1071, "consistency": 1.0, "total": 15, "passed": 15, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.7961, "ci_upper": 1.0 }, "by_category": { "greeting": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0105, "latency_p95_ms": 0.0441, "latency_p99_ms": 0.0485, "consistency": 1.0, "total": 4, "passed": 4, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5101, "ci_upper": 1.0 }, "tool_query": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0048, "latency_p95_ms": 0.0085, "latency_p99_ms": 0.0089, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "skill_prefix": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0195, "latency_p95_ms": 0.1068, "latency_p99_ms": 0.1146, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "complex": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0045, "latency_p95_ms": 0.0069, "latency_p99_ms": 0.0071, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0081, "latency_p95_ms": 0.0423, "latency_p99_ms": 0.0481, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0065, "latency_p95_ms": 0.0178, "latency_p99_ms": 0.0192, "consistency": 1.0, "total": 7, "passed": 7, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6457, "ci_upper": 1.0 }, "hard": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0072, "latency_p95_ms": 0.1056, "latency_p99_ms": 0.1143, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "cases": [ { "task_id": "prep-001", "dimension": "preprocessing", "category": "greeting", "difficulty": "easy", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0496, "root_cause": "none", "detail": "input='你好' method=regex_direct", "consistency": 1.0 }, { "task_id": "prep-002", "dimension": "preprocessing", "category": "greeting", "difficulty": "easy", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0129, "root_cause": "none", "detail": "input='hello' method=regex_direct", "consistency": 1.0 }, { "task_id": "prep-003", "dimension": "preprocessing", "category": "greeting", "difficulty": "easy", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0081, "root_cause": "none", "detail": "input='谢谢' method=regex_direct", "consistency": 1.0 }, { "task_id": "prep-004", "dimension": "preprocessing", "category": "greeting", "difficulty": "easy", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0064, "root_cause": "none", "detail": "input='你是谁' method=regex_direct", "consistency": 1.0 }, { "task_id": "prep-005", "dimension": "preprocessing", "category": "tool_query", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0065, "root_cause": "none", "detail": "input='搜索golang教程' method=default_react", "consistency": 1.0 }, { "task_id": "prep-006", "dimension": "preprocessing", "category": "tool_query", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0048, "root_cause": "none", "detail": "input='执行ls命令' method=default_react", "consistency": 1.0 }, { "task_id": "prep-007", "dimension": "preprocessing", "category": "tool_query", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0042, "root_cause": "none", "detail": "input='翻译hello为中文' method=default_react", "consistency": 1.0 }, { "task_id": "prep-008", "dimension": "preprocessing", "category": "tool_query", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.009, "root_cause": "none", "detail": "input='什么是机器学习' method=default_react", "consistency": 1.0 }, { "task_id": "prep-009", "dimension": "preprocessing", "category": "tool_query", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0043, "root_cause": "none", "detail": "input='帮我分析数据' method=default_react", "consistency": 1.0 }, { "task_id": "prep-010", "dimension": "preprocessing", "category": "skill_prefix", "difficulty": "medium", "passed": true, "expected": "skill_react", "actual": "skill_react", "duration_ms": 0.0195, "root_cause": "none", "detail": "input='@skill:react_agent 查看ip' method=skill_prefix", "consistency": 1.0 }, { "task_id": "prep-011", "dimension": "preprocessing", "category": "skill_prefix", "difficulty": "medium", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0137, "root_cause": "none", "detail": "input='@skill:chat_only 你好' method=skill_prefix", "consistency": 1.0 }, { "task_id": "prep-012", "dimension": "preprocessing", "category": "skill_prefix", "difficulty": "hard", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.1165, "root_cause": "none", "detail": "input='@skill:nonexistent 做点什么' method=skill_not_found_fallback", "consistency": 1.0 }, { "task_id": "prep-013", "dimension": "preprocessing", "category": "complex", "difficulty": "hard", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0072, "root_cause": "none", "detail": "input='帮我分析这个数据并生成报告' method=default_react", "consistency": 1.0 }, { "task_id": "prep-014", "dimension": "preprocessing", "category": "complex", "difficulty": "easy", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0045, "root_cause": "none", "detail": "input='随便聊聊' method=default_react", "consistency": 1.0 }, { "task_id": "prep-015", "dimension": "preprocessing", "category": "complex", "difficulty": "hard", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0043, "root_cause": "none", "detail": "input='请帮我完成以下任务:1. 查询天气 2. 生成报告' method=default_react", "consistency": 1.0 } ] }, "overfitting": { "metrics": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0132, "latency_p95_ms": 0.0327, "latency_p99_ms": 0.0347, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "by_category": { "ip_check": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0352, "latency_p95_ms": 0.0352, "latency_p99_ms": 0.0352, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "search": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0132, "latency_p95_ms": 0.0132, "latency_p99_ms": 0.0132, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "greeting": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0228, "latency_p95_ms": 0.0228, "latency_p99_ms": 0.0228, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "tool_use": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0124, "latency_p95_ms": 0.0124, "latency_p99_ms": 0.0124, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "complex": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0117, "latency_p95_ms": 0.0117, "latency_p99_ms": 0.0117, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "by_difficulty": { "medium": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0132, "latency_p95_ms": 0.033, "latency_p99_ms": 0.0348, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "easy": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0228, "latency_p95_ms": 0.0228, "latency_p99_ms": 0.0228, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "hard": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0117, "latency_p95_ms": 0.0117, "latency_p99_ms": 0.0117, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "cases": [ { "task_id": "over-001", "dimension": "overfitting", "category": "ip_check", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0352, "root_cause": "none", "detail": "paraphrases=5 modes=['react', 'react', 'react', 'react', 'react']", "consistency": 1.0 }, { "task_id": "over-002", "dimension": "overfitting", "category": "search", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0132, "root_cause": "none", "detail": "paraphrases=3 modes=['react', 'react', 'react']", "consistency": 1.0 }, { "task_id": "over-003", "dimension": "overfitting", "category": "greeting", "difficulty": "easy", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0228, "root_cause": "none", "detail": "paraphrases=5 modes=['direct_chat', 'direct_chat', 'direct_chat', 'direct_chat', 'direct_chat']", "consistency": 1.0 }, { "task_id": "over-004", "dimension": "overfitting", "category": "tool_use", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0124, "root_cause": "none", "detail": "paraphrases=3 modes=['react', 'react', 'react']", "consistency": 1.0 }, { "task_id": "over-005", "dimension": "overfitting", "category": "complex", "difficulty": "hard", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0117, "root_cause": "none", "detail": "paraphrases=3 modes=['react', 'react', 'react']", "consistency": 1.0 } ] }, "efficiency": { "metrics": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.33, "latency_p95_ms": 0.642, "latency_p99_ms": 0.6724, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "by_category": { "preprocess_latency": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.33, "latency_p95_ms": 0.474, "latency_p99_ms": 0.4868, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "tool_search_latency": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.36, "latency_p95_ms": 0.648, "latency_p99_ms": 0.6736, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.17, "latency_p95_ms": 0.287, "latency_p99_ms": 0.2974, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.49, "latency_p95_ms": 0.661, "latency_p99_ms": 0.6762, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "cases": [ { "task_id": "eff-001", "dimension": "efficiency", "category": "preprocess_latency", "difficulty": "easy", "passed": true, "expected": "<=50ms", "actual": "0.003ms", "duration_ms": 0.3, "root_cause": "none", "detail": "iterations=100 avg=0.003ms threshold=50.0ms", "consistency": 1.0 }, { "task_id": "eff-002", "dimension": "efficiency", "category": "preprocess_latency", "difficulty": "medium", "passed": true, "expected": "<=50ms", "actual": "0.003ms", "duration_ms": 0.33, "root_cause": "none", "detail": "iterations=100 avg=0.003ms threshold=50.0ms", "consistency": 1.0 }, { "task_id": "eff-003", "dimension": "efficiency", "category": "preprocess_latency", "difficulty": "medium", "passed": true, "expected": "<=50ms", "actual": "0.005ms", "duration_ms": 0.49, "root_cause": "none", "detail": "iterations=100 avg=0.005ms threshold=50.0ms", "consistency": 1.0 }, { "task_id": "eff-004", "dimension": "efficiency", "category": "tool_search_latency", "difficulty": "medium", "passed": true, "expected": "<=10ms", "actual": "0.007ms", "duration_ms": 0.68, "root_cause": "none", "detail": "iterations=100 avg=0.007ms threshold=10.0ms", "consistency": 1.0 }, { "task_id": "eff-005", "dimension": "efficiency", "category": "tool_search_latency", "difficulty": "easy", "passed": true, "expected": "<=5ms", "actual": "0.000ms", "duration_ms": 0.04, "root_cause": "none", "detail": "iterations=100 avg=0.000ms threshold=5.0ms", "consistency": 1.0 } ] }, "tool_search": { "metrics": { "accuracy": 1.0, "precision": 0.8333, "recall": 0.8333, "f1": 0.8333, "latency_p50_ms": 0.0107, "latency_p95_ms": 0.0193, "latency_p99_ms": 0.0222, "consistency": 1.0, "total": 10, "passed": 10, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.7225, "ci_upper": 1.0 }, "by_category": { "exact_match": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0125, "latency_p95_ms": 0.0213, "latency_p99_ms": 0.0226, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "fuzzy_match": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.01, "latency_p95_ms": 0.0102, "latency_p99_ms": 0.0102, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "no_match": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.0039, "latency_p95_ms": 0.0062, "latency_p99_ms": 0.0064, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "top_k": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.008, "latency_p95_ms": 0.008, "latency_p99_ms": 0.008, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.8333, "recall": 0.8333, "f1": 0.8333, "latency_p50_ms": 0.0114, "latency_p95_ms": 0.0205, "latency_p99_ms": 0.0224, "consistency": 1.0, "total": 7, "passed": 7, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6457, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0099, "latency_p95_ms": 0.0102, "latency_p99_ms": 0.0102, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "cases": [ { "task_id": "ts-001", "dimension": "tool_search", "category": "exact_match", "difficulty": "easy", "passed": true, "expected": "read_file", "actual": "read_file", "duration_ms": 0.0229, "root_cause": "none", "detail": "query='read file' top_k=5 results=2", "consistency": 1.0 }, { "task_id": "ts-002", "dimension": "tool_search", "category": "exact_match", "difficulty": "easy", "passed": true, "expected": "write_file", "actual": "write_file", "duration_ms": 0.0148, "root_cause": "none", "detail": "query='write file content' top_k=5 results=2", "consistency": 1.0 }, { "task_id": "ts-003", "dimension": "tool_search", "category": "exact_match", "difficulty": "easy", "passed": true, "expected": "web_search", "actual": "web_search", "duration_ms": 0.0125, "root_cause": "none", "detail": "query='search web information' top_k=5 results=2", "consistency": 1.0 }, { "task_id": "ts-004", "dimension": "tool_search", "category": "exact_match", "difficulty": "easy", "passed": true, "expected": "shell_exec", "actual": "shell_exec", "duration_ms": 0.0112, "root_cause": "none", "detail": "query='execute shell command' top_k=5 results=1", "consistency": 1.0 }, { "task_id": "ts-005", "dimension": "tool_search", "category": "exact_match", "difficulty": "easy", "passed": true, "expected": "http_request", "actual": "http_request", "duration_ms": 0.0114, "root_cause": "none", "detail": "query='send http request url' top_k=5 results=1", "consistency": 1.0 }, { "task_id": "ts-006", "dimension": "tool_search", "category": "fuzzy_match", "difficulty": "medium", "passed": true, "expected": "read_file", "actual": "read_file", "duration_ms": 0.0102, "root_cause": "none", "detail": "query='io file' top_k=5 results=2", "consistency": 1.0 }, { "task_id": "ts-007", "dimension": "tool_search", "category": "fuzzy_match", "difficulty": "medium", "passed": true, "expected": "web_search", "actual": "web_search", "duration_ms": 0.0099, "root_cause": "none", "detail": "query='search query engine' top_k=5 results=1", "consistency": 1.0 }, { "task_id": "ts-008", "dimension": "tool_search", "category": "no_match", "difficulty": "easy", "passed": true, "expected": "__none__", "actual": "[]", "duration_ms": 0.0014, "root_cause": "none", "detail": "query='' top_k=5 results=0", "consistency": 1.0 }, { "task_id": "ts-009", "dimension": "tool_search", "category": "no_match", "difficulty": "easy", "passed": true, "expected": "__none__", "actual": "[]", "duration_ms": 0.0065, "root_cause": "none", "detail": "query='zzzznonexistent' top_k=5 results=0", "consistency": 1.0 }, { "task_id": "ts-010", "dimension": "tool_search", "category": "top_k", "difficulty": "medium", "passed": true, "expected": "read_file", "actual": "read_file", "duration_ms": 0.008, "root_cause": "none", "detail": "query='file' top_k=1 results=1", "consistency": 1.0 } ] }, "event_model": { "metrics": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.0524, "latency_p95_ms": 15.8743, "latency_p99_ms": 20.0787, "consistency": 1.0, "total": 6, "passed": 6, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6097, "ci_upper": 1.0 }, "by_category": { "sq_lifecycle": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.0436, "latency_p95_ms": 0.1013, "latency_p99_ms": 0.1064, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "eq_lifecycle": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.0613, "latency_p95_ms": 19.0229, "latency_p99_ms": 20.7084, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.0524, "latency_p95_ms": 15.8743, "latency_p99_ms": 20.0787, "consistency": 1.0, "total": 6, "passed": 6, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6097, "ci_upper": 1.0 } }, "cases": [ { "task_id": "ev-001", "dimension": "event_model", "category": "sq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "drained=['hello']", "duration_ms": 0.1077, "root_cause": "none", "detail": "task_id=0fd87910...", "consistency": 1.0 }, { "task_id": "ev-002", "dimension": "event_model", "category": "sq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "cancelled=True", "duration_ms": 0.0436, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "ev-003", "dimension": "event_model", "category": "sq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "raised=True closed=True", "duration_ms": 0.0097, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "ev-004", "dimension": "event_model", "category": "eq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "received=1", "duration_ms": 0.0613, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "ev-005", "dimension": "event_model", "category": "eq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "events=1 closed=True", "duration_ms": 21.1298, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "ev-006", "dimension": "event_model", "category": "eq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "subscribers=0", "duration_ms": 0.0079, "root_cause": "none", "detail": "", "consistency": 1.0 } ] }, "spec_management": { "metrics": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 1.9377, "latency_p95_ms": 2.9432, "latency_p99_ms": 3.2494, "consistency": 1.0, "total": 7, "passed": 7, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6457, "ci_upper": 1.0 }, "by_category": { "crud": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 2.0343, "latency_p95_ms": 3.0707, "latency_p99_ms": 3.2749, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "edge": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.9924, "latency_p95_ms": 1.8432, "latency_p99_ms": 1.9188, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 1.7803, "latency_p95_ms": 3.0069, "latency_p99_ms": 3.2621, "consistency": 1.0, "total": 6, "passed": 6, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6097, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 1.9377, "latency_p95_ms": 1.9377, "latency_p99_ms": 1.9377, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "cases": [ { "task_id": "sm-001", "dimension": "spec_management", "category": "crud", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "exists=True", "duration_ms": 2.0343, "root_cause": "none", "detail": "path=/var/folders/6b/ljk5bdq50yxcsth24frf05200000gn/T/agentkit-benchmark-idcioepn/run-0/specs/sm-001/test-spec.yaml", "consistency": 1.0 }, { "task_id": "sm-002", "dimension": "spec_management", "category": "crud", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "steps=2", "duration_ms": 2.0501, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "sm-003", "dimension": "spec_management", "category": "crud", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "goal=Updated goal", "duration_ms": 1.5264, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "sm-004", "dimension": "spec_management", "category": "crud", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "deleted=True remaining=0", "duration_ms": 1.3234, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "sm-005", "dimension": "spec_management", "category": "crud", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "count=2", "duration_ms": 3.3259, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "sm-006", "dimension": "spec_management", "category": "edge", "difficulty": "medium", "passed": true, "expected": "passed", "actual": "status=confirmed", "duration_ms": 1.9377, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "sm-007", "dimension": "spec_management", "category": "edge", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "result=None", "duration_ms": 0.0472, "root_cause": "none", "detail": "", "consistency": 1.0 } ] }, "verification": { "metrics": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 22.2216, "latency_p95_ms": 47.7927, "latency_p99_ms": 50.9297, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "by_category": { "basic": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 16.9399, "latency_p95_ms": 18.6778, "latency_p99_ms": 18.8323, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "retry": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 51.714, "latency_p95_ms": 51.714, "latency_p99_ms": 51.714, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "timeout": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.0, "latency_p95_ms": 0.0, "latency_p99_ms": 0.0, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "multi": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 25.5723, "latency_p95_ms": 25.5723, "latency_p99_ms": 25.5723, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 16.9399, "latency_p95_ms": 18.6778, "latency_p99_ms": 18.8323, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 38.6431, "latency_p95_ms": 50.4069, "latency_p99_ms": 51.4526, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "cases": [ { "task_id": "vf-001", "dimension": "verification", "category": "basic", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "passed=True attempts=1", "duration_ms": 18.8709, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "vf-002", "dimension": "verification", "category": "basic", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "passed=False errors=1", "duration_ms": 15.0089, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "vf-003", "dimension": "verification", "category": "retry", "difficulty": "medium", "passed": true, "expected": "passed", "actual": "attempts=3 callbacks=2", "duration_ms": 51.714, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "vf-004", "dimension": "verification", "category": "timeout", "difficulty": "medium", "passed": true, "expected": "passed", "actual": "passed=False errors=1", "duration_ms": 509.6538, "root_cause": "none", "detail": "timeout errors=['Command timed out after 0.5s: sleep 10']", "consistency": 1.0 }, { "task_id": "vf-005", "dimension": "verification", "category": "multi", "difficulty": "medium", "passed": true, "expected": "passed", "actual": "passed=False", "duration_ms": 25.5723, "root_cause": "none", "detail": "", "consistency": 1.0 } ] }, "board_meeting": { "metrics": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0107, "latency_p95_ms": 0.3934, "latency_p99_ms": 1.1873, "consistency": 1.0, "total": 18, "passed": 18, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.8241, "ci_upper": 1.0 }, "by_category": { "default_template": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0141, "latency_p95_ms": 0.031, "latency_p99_ms": 0.0325, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "explicit_experts": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0138, "latency_p95_ms": 0.0178, "latency_p99_ms": 0.0181, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "topic_extraction": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.005, "latency_p95_ms": 0.0073, "latency_p99_ms": 0.0075, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "no_match": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0032, "latency_p95_ms": 0.0032, "latency_p99_ms": 0.0032, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "name_validation": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0168, "latency_p95_ms": 0.1981, "latency_p99_ms": 0.2143, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "stop_command": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0102, "latency_p95_ms": 1.2482, "latency_p99_ms": 1.3583, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.005, "latency_p95_ms": 0.7093, "latency_p99_ms": 1.2505, "consistency": 1.0, "total": 11, "passed": 11, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.7412, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0138, "latency_p95_ms": 0.1583, "latency_p99_ms": 0.2063, "consistency": 1.0, "total": 7, "passed": 7, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6457, "ci_upper": 1.0 } }, "cases": [ { "task_id": "bd-001", "dimension": "board_meeting", "category": "default_template", "difficulty": "easy", "passed": true, "expected": "board", "actual": "board", "duration_ms": 0.0329, "root_cause": "none", "detail": "matched=True board_mode=True use_default=True topic='讨论是否应该进入东南亚市场'", "consistency": 1.0 }, { "task_id": "bd-002", "dimension": "board_meeting", "category": "default_template", "difficulty": "easy", "passed": true, "expected": "board", "actual": "board", "duration_ms": 0.0141, "root_cause": "none", "detail": "matched=True board_mode=True use_default=True topic='AI产品定价策略应该怎么做'", "consistency": 1.0 }, { "task_id": "bd-003", "dimension": "board_meeting", "category": "default_template", "difficulty": "medium", "passed": true, "expected": "board", "actual": "board", "duration_ms": 0.0113, "root_cause": "none", "detail": "matched=True board_mode=True use_default=True topic='讨论创业公司融资节奏'", "consistency": 1.0 }, { "task_id": "bd-004", "dimension": "board_meeting", "category": "explicit_experts", "difficulty": "medium", "passed": true, "expected": "board", "actual": "board", "duration_ms": 0.0182, "root_cause": "none", "detail": "matched=True experts=['elon_musk', 'jeff_bezos'] use_default=False", "consistency": 1.0 }, { "task_id": "bd-005", "dimension": "board_meeting", "category": "explicit_experts", "difficulty": "medium", "passed": true, "expected": "board", "actual": "board", "duration_ms": 0.0112, "root_cause": "none", "detail": "matched=True experts=['charlie_munger', 'warren_buffett'] use_default=False", "consistency": 1.0 }, { "task_id": "bd-006", "dimension": "board_meeting", "category": "explicit_experts", "difficulty": "medium", "passed": true, "expected": "board", "actual": "board", "duration_ms": 0.0138, "root_cause": "none", "detail": "matched=True experts=['elon_musk', 'jeff_bezos', 'allenzhang'] use_default=False", "consistency": 1.0 }, { "task_id": "bd-007", "dimension": "board_meeting", "category": "topic_extraction", "difficulty": "easy", "passed": true, "expected": "讨论是否应该进入东南亚市场", "actual": "讨论是否应该进入东南亚市场", "duration_ms": 0.005, "root_cause": "none", "detail": "input='@board 讨论是否应该进入东南亚市场' topic='讨论是否应该进入东南亚市场' matched=True", "consistency": 1.0 }, { "task_id": "bd-008", "dimension": "board_meeting", "category": "topic_extraction", "difficulty": "easy", "passed": true, "expected": "火星商业化方案", "actual": "火星商业化方案", "duration_ms": 0.0076, "root_cause": "none", "detail": "input='@board:elon_musk,jeff_bezos 火星商业化方案' topic='火星商业化方案' matched=True", "consistency": 1.0 }, { "task_id": "bd-009", "dimension": "board_meeting", "category": "topic_extraction", "difficulty": "easy", "passed": true, "expected": "", "actual": "", "duration_ms": 0.0049, "root_cause": "none", "detail": "input='@board' topic='' matched=True", "consistency": 1.0 }, { "task_id": "bd-010", "dimension": "board_meeting", "category": "no_match", "difficulty": "easy", "passed": true, "expected": "not_board", "actual": "not_board", "duration_ms": 0.0032, "root_cause": "none", "detail": "input='讨论一下市场策略' matched=False board_mode=False", "consistency": 1.0 }, { "task_id": "bd-011", "dimension": "board_meeting", "category": "no_match", "difficulty": "easy", "passed": true, "expected": "not_board", "actual": "not_board", "duration_ms": 0.0032, "root_cause": "none", "detail": "input='@team:analyst,writer 协作完成任务' matched=False board_mode=False", "consistency": 1.0 }, { "task_id": "bd-012", "dimension": "board_meeting", "category": "no_match", "difficulty": "easy", "passed": true, "expected": "not_board", "actual": "not_board", "duration_ms": 0.0031, "root_cause": "none", "detail": "input='@skill:react_agent 查看ip' matched=False board_mode=False", "consistency": 1.0 }, { "task_id": "bd-013", "dimension": "board_meeting", "category": "name_validation", "difficulty": "medium", "passed": true, "expected": "2_valid", "actual": "2_valid", "duration_ms": 0.0103, "root_cause": "none", "detail": "input='@board:elon_musk,jeff_bezos 主题' experts=['elon_musk', 'jeff_bezos'] max=10", "consistency": 1.0 }, { "task_id": "bd-014", "dimension": "board_meeting", "category": "name_validation", "difficulty": "medium", "passed": true, "expected": "default_fallback", "actual": "default_fallback", "duration_ms": 0.2183, "root_cause": "none", "detail": "input='@board:@#$ 主题' experts=['elon_musk', 'jeff_bezos', 'allenzhang', 'charlie_munger', 'paul_graham'] max=10", "consistency": 1.0 }, { "task_id": "bd-015", "dimension": "board_meeting", "category": "name_validation", "difficulty": "medium", "passed": true, "expected": "10_capped", "actual": "10_capped", "duration_ms": 0.0168, "root_cause": "none", "detail": "input='@board:a,b,c,d,e,f,g,h,i,j,k 主题' experts=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] max=10", "consistency": 1.0 }, { "task_id": "bd-016", "dimension": "board_meeting", "category": "stop_command", "difficulty": "easy", "passed": true, "expected": "is_stop", "actual": "is_stop", "duration_ms": 1.3858, "root_cause": "none", "detail": "input='/stop' stop_commands=frozenset({'结束讨论', '停止讨论', 'stop', '/stop'})", "consistency": 1.0 }, { "task_id": "bd-017", "dimension": "board_meeting", "category": "stop_command", "difficulty": "easy", "passed": true, "expected": "is_stop", "actual": "is_stop", "duration_ms": 0.0102, "root_cause": "none", "detail": "input='停止讨论' stop_commands=frozenset({'结束讨论', '停止讨论', 'stop', '/stop'})", "consistency": 1.0 }, { "task_id": "bd-018", "dimension": "board_meeting", "category": "stop_command", "difficulty": "easy", "passed": true, "expected": "not_stop", "actual": "not_stop", "duration_ms": 0.0022, "root_cause": "none", "detail": "input='继续讨论' stop_commands=frozenset({'结束讨论', '停止讨论', 'stop', '/stop'})", "consistency": 1.0 } ] } } }