{ "timestamp": "2026-06-17T04:52:53.863927+00:00", "version": "0.1.0", "mode": "all", "runs": 1, "fast": false, "overall_accuracy": 0.9524, "overall_accuracy_mean": 0.9524, "overall_accuracy_std": 0.0, "summary": "60/63 tests passed (3 failed) across 9 dimensions.", "dimensions": { "preprocessing": { "metrics": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0128, "latency_p95_ms": 0.057, "latency_p99_ms": 0.1086, "consistency": 1.0, "total": 15, "passed": 15, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.7961, "ci_upper": 1.0 }, "by_category": { "greeting": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0133, "latency_p95_ms": 0.026, "latency_p99_ms": 0.0275, "consistency": 1.0, "total": 4, "passed": 4, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5101, "ci_upper": 1.0 }, "tool_query": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0115, "latency_p95_ms": 0.0166, "latency_p99_ms": 0.0172, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "skill_prefix": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0294, "latency_p95_ms": 0.1123, "latency_p99_ms": 0.1197, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "complex": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0101, "latency_p95_ms": 0.0125, "latency_p99_ms": 0.0127, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0115, "latency_p95_ms": 0.0253, "latency_p99_ms": 0.0274, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0136, "latency_p95_ms": 0.0263, "latency_p99_ms": 0.0288, "consistency": 1.0, "total": 7, "passed": 7, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6457, "ci_upper": 1.0 }, "hard": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0128, "latency_p95_ms": 0.1106, "latency_p99_ms": 0.1193, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "cases": [ { "task_id": "prep-001", "dimension": "preprocessing", "category": "greeting", "difficulty": "easy", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0279, "root_cause": "none", "detail": "input='你好' method=regex_direct", "consistency": 1.0 }, { "task_id": "prep-002", "dimension": "preprocessing", "category": "greeting", "difficulty": "easy", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0151, "root_cause": "none", "detail": "input='hello' method=regex_direct", "consistency": 1.0 }, { "task_id": "prep-003", "dimension": "preprocessing", "category": "greeting", "difficulty": "easy", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0111, "root_cause": "none", "detail": "input='谢谢' method=regex_direct", "consistency": 1.0 }, { "task_id": "prep-004", "dimension": "preprocessing", "category": "greeting", "difficulty": "easy", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0115, "root_cause": "none", "detail": "input='你是谁' method=regex_direct", "consistency": 1.0 }, { "task_id": "prep-005", "dimension": "preprocessing", "category": "tool_query", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0136, "root_cause": "none", "detail": "input='搜索golang教程' method=default_react", "consistency": 1.0 }, { "task_id": "prep-006", "dimension": "preprocessing", "category": "tool_query", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0115, "root_cause": "none", "detail": "input='执行ls命令' method=default_react", "consistency": 1.0 }, { "task_id": "prep-007", "dimension": "preprocessing", "category": "tool_query", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0174, "root_cause": "none", "detail": "input='翻译hello为中文' method=default_react", "consistency": 1.0 }, { "task_id": "prep-008", "dimension": "preprocessing", "category": "tool_query", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0113, "root_cause": "none", "detail": "input='什么是机器学习' method=default_react", "consistency": 1.0 }, { "task_id": "prep-009", "dimension": "preprocessing", "category": "tool_query", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0109, "root_cause": "none", "detail": "input='帮我分析数据' method=default_react", "consistency": 1.0 }, { "task_id": "prep-010", "dimension": "preprocessing", "category": "skill_prefix", "difficulty": "medium", "passed": true, "expected": "skill_react", "actual": "skill_react", "duration_ms": 0.0294, "root_cause": "none", "detail": "input='@skill:react_agent 查看ip' method=skill_prefix", "consistency": 1.0 }, { "task_id": "prep-011", "dimension": "preprocessing", "category": "skill_prefix", "difficulty": "medium", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0191, "root_cause": "none", "detail": "input='@skill:chat_only 你好' method=skill_prefix", "consistency": 1.0 }, { "task_id": "prep-012", "dimension": "preprocessing", "category": "skill_prefix", "difficulty": "hard", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.1215, "root_cause": "none", "detail": "input='@skill:nonexistent 做点什么' method=skill_not_found_fallback", "consistency": 1.0 }, { "task_id": "prep-013", "dimension": "preprocessing", "category": "complex", "difficulty": "hard", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0101, "root_cause": "none", "detail": "input='帮我分析这个数据并生成报告' method=default_react", "consistency": 1.0 }, { "task_id": "prep-014", "dimension": "preprocessing", "category": "complex", "difficulty": "easy", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0099, "root_cause": "none", "detail": "input='随便聊聊' method=default_react", "consistency": 1.0 }, { "task_id": "prep-015", "dimension": "preprocessing", "category": "complex", "difficulty": "hard", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0128, "root_cause": "none", "detail": "input='请帮我完成以下任务:1. 查询天气 2. 生成报告' method=default_react", "consistency": 1.0 } ] }, "overfitting": { "metrics": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.025, "latency_p95_ms": 0.0557, "latency_p99_ms": 0.0596, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "by_category": { "ip_check": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0362, "latency_p95_ms": 0.0362, "latency_p99_ms": 0.0362, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "search": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0243, "latency_p95_ms": 0.0243, "latency_p99_ms": 0.0243, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "greeting": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0606, "latency_p95_ms": 0.0606, "latency_p99_ms": 0.0606, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "tool_use": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0233, "latency_p95_ms": 0.0233, "latency_p99_ms": 0.0233, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "complex": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.025, "latency_p95_ms": 0.025, "latency_p99_ms": 0.025, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "by_difficulty": { "medium": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0243, "latency_p95_ms": 0.035, "latency_p99_ms": 0.036, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "easy": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0606, "latency_p95_ms": 0.0606, "latency_p99_ms": 0.0606, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "hard": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.025, "latency_p95_ms": 0.025, "latency_p99_ms": 0.025, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "cases": [ { "task_id": "over-001", "dimension": "overfitting", "category": "ip_check", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0362, "root_cause": "none", "detail": "paraphrases=5 modes=['react', 'react', 'react', 'react', 'react']", "consistency": 1.0 }, { "task_id": "over-002", "dimension": "overfitting", "category": "search", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0243, "root_cause": "none", "detail": "paraphrases=3 modes=['react', 'react', 'react']", "consistency": 1.0 }, { "task_id": "over-003", "dimension": "overfitting", "category": "greeting", "difficulty": "easy", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0606, "root_cause": "none", "detail": "paraphrases=5 modes=['direct_chat', 'direct_chat', 'direct_chat', 'direct_chat', 'direct_chat']", "consistency": 1.0 }, { "task_id": "over-004", "dimension": "overfitting", "category": "tool_use", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0233, "root_cause": "none", "detail": "paraphrases=3 modes=['react', 'react', 'react']", "consistency": 1.0 }, { "task_id": "over-005", "dimension": "overfitting", "category": "complex", "difficulty": "hard", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.025, "root_cause": "none", "detail": "paraphrases=3 modes=['react', 'react', 'react']", "consistency": 1.0 } ] }, "efficiency": { "metrics": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.33, "latency_p95_ms": 0.622, "latency_p99_ms": 0.6604, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "by_category": { "preprocess_latency": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.33, "latency_p95_ms": 0.42, "latency_p99_ms": 0.428, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "tool_search_latency": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.355, "latency_p95_ms": 0.6385, "latency_p99_ms": 0.6637, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.165, "latency_p95_ms": 0.2775, "latency_p99_ms": 0.2875, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.43, "latency_p95_ms": 0.646, "latency_p99_ms": 0.6652, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "cases": [ { "task_id": "eff-001", "dimension": "efficiency", "category": "preprocess_latency", "difficulty": "easy", "passed": true, "expected": "<=50ms", "actual": "0.003ms", "duration_ms": 0.29, "root_cause": "none", "detail": "iterations=100 avg=0.003ms threshold=50.0ms", "consistency": 1.0 }, { "task_id": "eff-002", "dimension": "efficiency", "category": "preprocess_latency", "difficulty": "medium", "passed": true, "expected": "<=50ms", "actual": "0.003ms", "duration_ms": 0.33, "root_cause": "none", "detail": "iterations=100 avg=0.003ms threshold=50.0ms", "consistency": 1.0 }, { "task_id": "eff-003", "dimension": "efficiency", "category": "preprocess_latency", "difficulty": "medium", "passed": true, "expected": "<=50ms", "actual": "0.004ms", "duration_ms": 0.43, "root_cause": "none", "detail": "iterations=100 avg=0.004ms threshold=50.0ms", "consistency": 1.0 }, { "task_id": "eff-004", "dimension": "efficiency", "category": "tool_search_latency", "difficulty": "medium", "passed": true, "expected": "<=10ms", "actual": "0.007ms", "duration_ms": 0.67, "root_cause": "none", "detail": "iterations=100 avg=0.007ms threshold=10.0ms", "consistency": 1.0 }, { "task_id": "eff-005", "dimension": "efficiency", "category": "tool_search_latency", "difficulty": "easy", "passed": true, "expected": "<=5ms", "actual": "0.000ms", "duration_ms": 0.04, "root_cause": "none", "detail": "iterations=100 avg=0.000ms threshold=5.0ms", "consistency": 1.0 } ] }, "tool_search": { "metrics": { "accuracy": 1.0, "precision": 0.8333, "recall": 0.8333, "f1": 0.8333, "latency_p50_ms": 0.0192, "latency_p95_ms": 0.0278, "latency_p99_ms": 0.0326, "consistency": 1.0, "total": 10, "passed": 10, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.7225, "ci_upper": 1.0 }, "by_category": { "exact_match": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0199, "latency_p95_ms": 0.0203, "latency_p99_ms": 0.0204, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "fuzzy_match": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0264, "latency_p95_ms": 0.0331, "latency_p99_ms": 0.0337, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "no_match": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.0118, "latency_p95_ms": 0.0122, "latency_p99_ms": 0.0123, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "top_k": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.016, "latency_p95_ms": 0.016, "latency_p99_ms": 0.016, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.8333, "recall": 0.8333, "f1": 0.8333, "latency_p50_ms": 0.0194, "latency_p95_ms": 0.0203, "latency_p99_ms": 0.0204, "consistency": 1.0, "total": 7, "passed": 7, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6457, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.019, "latency_p95_ms": 0.0323, "latency_p99_ms": 0.0335, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "cases": [ { "task_id": "ts-001", "dimension": "tool_search", "category": "exact_match", "difficulty": "easy", "passed": true, "expected": "read_file", "actual": "read_file", "duration_ms": 0.0199, "root_cause": "none", "detail": "query='read file' top_k=5 results=2", "consistency": 1.0 }, { "task_id": "ts-002", "dimension": "tool_search", "category": "exact_match", "difficulty": "easy", "passed": true, "expected": "write_file", "actual": "write_file", "duration_ms": 0.0204, "root_cause": "none", "detail": "query='write file content' top_k=5 results=2", "consistency": 1.0 }, { "task_id": "ts-003", "dimension": "tool_search", "category": "exact_match", "difficulty": "easy", "passed": true, "expected": "web_search", "actual": "web_search", "duration_ms": 0.02, "root_cause": "none", "detail": "query='search web information' top_k=5 results=2", "consistency": 1.0 }, { "task_id": "ts-004", "dimension": "tool_search", "category": "exact_match", "difficulty": "easy", "passed": true, "expected": "shell_exec", "actual": "shell_exec", "duration_ms": 0.018, "root_cause": "none", "detail": "query='execute shell command' top_k=5 results=1", "consistency": 1.0 }, { "task_id": "ts-005", "dimension": "tool_search", "category": "exact_match", "difficulty": "easy", "passed": true, "expected": "http_request", "actual": "http_request", "duration_ms": 0.0194, "root_cause": "none", "detail": "query='send http request url' top_k=5 results=1", "consistency": 1.0 }, { "task_id": "ts-006", "dimension": "tool_search", "category": "fuzzy_match", "difficulty": "medium", "passed": true, "expected": "read_file", "actual": "read_file", "duration_ms": 0.0338, "root_cause": "none", "detail": "query='io file' top_k=5 results=2", "consistency": 1.0 }, { "task_id": "ts-007", "dimension": "tool_search", "category": "fuzzy_match", "difficulty": "medium", "passed": true, "expected": "web_search", "actual": "web_search", "duration_ms": 0.019, "root_cause": "none", "detail": "query='search query engine' top_k=5 results=1", "consistency": 1.0 }, { "task_id": "ts-008", "dimension": "tool_search", "category": "no_match", "difficulty": "easy", "passed": true, "expected": "__none__", "actual": "[]", "duration_ms": 0.0112, "root_cause": "none", "detail": "query='' top_k=5 results=0", "consistency": 1.0 }, { "task_id": "ts-009", "dimension": "tool_search", "category": "no_match", "difficulty": "easy", "passed": true, "expected": "__none__", "actual": "[]", "duration_ms": 0.0123, "root_cause": "none", "detail": "query='zzzznonexistent' top_k=5 results=0", "consistency": 1.0 }, { "task_id": "ts-010", "dimension": "tool_search", "category": "top_k", "difficulty": "medium", "passed": true, "expected": "read_file", "actual": "read_file", "duration_ms": 0.016, "root_cause": "none", "detail": "query='file' top_k=1 results=1", "consistency": 1.0 } ] }, "event_model": { "metrics": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.057, "latency_p95_ms": 15.9984, "latency_p99_ms": 20.2369, "consistency": 1.0, "total": 6, "passed": 6, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6097, "ci_upper": 1.0 }, "by_category": { "sq_lifecycle": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.046, "latency_p95_ms": 0.0982, "latency_p99_ms": 0.1028, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "eq_lifecycle": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.0681, "latency_p95_ms": 19.1737, "latency_p99_ms": 20.8719, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.057, "latency_p95_ms": 15.9984, "latency_p99_ms": 20.2369, "consistency": 1.0, "total": 6, "passed": 6, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6097, "ci_upper": 1.0 } }, "cases": [ { "task_id": "ev-001", "dimension": "event_model", "category": "sq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "drained=['hello']", "duration_ms": 0.104, "root_cause": "none", "detail": "task_id=09dccea9...", "consistency": 1.0 }, { "task_id": "ev-002", "dimension": "event_model", "category": "sq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "cancelled=True", "duration_ms": 0.046, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "ev-003", "dimension": "event_model", "category": "sq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "raised=True closed=True", "duration_ms": 0.0115, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "ev-004", "dimension": "event_model", "category": "eq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "received=1", "duration_ms": 0.0681, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "ev-005", "dimension": "event_model", "category": "eq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "events=1 closed=True", "duration_ms": 21.2965, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "ev-006", "dimension": "event_model", "category": "eq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "subscribers=0", "duration_ms": 0.007, "root_cause": "none", "detail": "", "consistency": 1.0 } ] }, "spec_management": { "metrics": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 1.3834, "latency_p95_ms": 3.4578, "latency_p99_ms": 4.0077, "consistency": 1.0, "total": 7, "passed": 7, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6457, "ci_upper": 1.0 }, "by_category": { "crud": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 1.3834, "latency_p95_ms": 3.6044, "latency_p99_ms": 4.037, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "edge": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.9497, "latency_p95_ms": 1.7635, "latency_p99_ms": 1.8358, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 1.3659, "latency_p95_ms": 3.4693, "latency_p99_ms": 4.01, "consistency": 1.0, "total": 6, "passed": 6, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6097, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 1.8539, "latency_p95_ms": 1.8539, "latency_p99_ms": 1.8539, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "cases": [ { "task_id": "sm-001", "dimension": "spec_management", "category": "crud", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "exists=True", "duration_ms": 1.3484, "root_cause": "none", "detail": "path=/var/folders/6b/ljk5bdq50yxcsth24frf05200000gn/T/agentkit-benchmark-wll_nqgl/run-0/specs/sm-001/test-spec.yaml", "consistency": 1.0 }, { "task_id": "sm-002", "dimension": "spec_management", "category": "crud", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "steps=2", "duration_ms": 1.3834, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "sm-003", "dimension": "spec_management", "category": "crud", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "goal=Updated goal", "duration_ms": 1.4414, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "sm-004", "dimension": "spec_management", "category": "crud", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "deleted=True remaining=0", "duration_ms": 1.0766, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "sm-005", "dimension": "spec_management", "category": "crud", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "count=2", "duration_ms": 4.1452, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "sm-006", "dimension": "spec_management", "category": "edge", "difficulty": "medium", "passed": true, "expected": "passed", "actual": "status=confirmed", "duration_ms": 1.8539, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "sm-007", "dimension": "spec_management", "category": "edge", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "result=None", "duration_ms": 0.0454, "root_cause": "none", "detail": "", "consistency": 1.0 } ] }, "verification": { "metrics": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 22.0041, "latency_p95_ms": 411.5705, "latency_p99_ms": 487.0649, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "by_category": { "basic": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 11.4916, "latency_p95_ms": 11.8303, "latency_p99_ms": 11.8604, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "retry": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 34.0985, "latency_p95_ms": 34.0985, "latency_p99_ms": 34.0985, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "timeout": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 505.9385, "latency_p95_ms": 505.9385, "latency_p99_ms": 505.9385, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "multi": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 22.0041, "latency_p95_ms": 22.0041, "latency_p99_ms": 22.0041, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 11.4916, "latency_p95_ms": 11.8303, "latency_p99_ms": 11.8604, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 34.0985, "latency_p95_ms": 458.7545, "latency_p99_ms": 496.5017, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "cases": [ { "task_id": "vf-001", "dimension": "verification", "category": "basic", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "passed=True attempts=1", "duration_ms": 11.8679, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "vf-002", "dimension": "verification", "category": "basic", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "passed=False errors=1", "duration_ms": 11.1154, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "vf-003", "dimension": "verification", "category": "retry", "difficulty": "medium", "passed": true, "expected": "passed", "actual": "attempts=3 callbacks=2", "duration_ms": 34.0985, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "vf-004", "dimension": "verification", "category": "timeout", "difficulty": "medium", "passed": true, "expected": "passed", "actual": "passed=False errors=1", "duration_ms": 505.9385, "root_cause": "none", "detail": "errors=['Command timed out after 0.5s: sleep 10']", "consistency": 1.0 }, { "task_id": "vf-005", "dimension": "verification", "category": "multi", "difficulty": "medium", "passed": true, "expected": "passed", "actual": "passed=False", "duration_ms": 22.0041, "root_cause": "none", "detail": "", "consistency": 1.0 } ] }, "llm_reasoning": { "metrics": { "accuracy": 0.6, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 25149.4865, "latency_p95_ms": 30001.1677, "latency_p99_ms": 30001.2291, "consistency": 1.0, "total": 5, "passed": 3, "failed": 2, "accuracy_mean": 0.6, "accuracy_std": 0.0, "ci_lower": 0.2307, "ci_upper": 0.8824 }, "by_category": { "intent_understanding": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 21288.4177, "latency_p95_ms": 21288.4177, "latency_p99_ms": 21288.4177, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "tool_selection": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 5894.9682, "latency_p95_ms": 5894.9682, "latency_p99_ms": 5894.9682, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "multi_step": { "accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 30000.8609, "latency_p95_ms": 30000.8609, "latency_p99_ms": 30000.8609, "consistency": 1.0, "total": 1, "passed": 0, "failed": 1, "accuracy_mean": 0.0, "accuracy_std": 0.0, "ci_lower": 0.0, "ci_upper": 0.7935 }, "code_generation": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 25149.4865, "latency_p95_ms": 25149.4865, "latency_p99_ms": 25149.4865, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "error_recovery": { "accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 30001.2444, "latency_p95_ms": 30001.2444, "latency_p99_ms": 30001.2444, "consistency": 1.0, "total": 1, "passed": 0, "failed": 1, "accuracy_mean": 0.0, "accuracy_std": 0.0, "ci_lower": 0.0, "ci_upper": 0.7935 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 21288.4177, "latency_p95_ms": 21288.4177, "latency_p99_ms": 21288.4177, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 15522.2273, "latency_p95_ms": 24186.7606, "latency_p99_ms": 24956.9413, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "hard": { "accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 30001.0526, "latency_p95_ms": 30001.2252, "latency_p99_ms": 30001.2406, "consistency": 1.0, "total": 2, "passed": 0, "failed": 2, "accuracy_mean": 0.0, "accuracy_std": 0.0, "ci_lower": 0.0, "ci_upper": 0.6576 } }, "cases": [ { "task_id": "llm-001", "dimension": "llm_reasoning", "category": "intent_understanding", "difficulty": "easy", "passed": true, "expected": "react", "actual": "mode=react tokens=1116 len=974", "duration_ms": 21288.4177, "root_cause": "none", "detail": "mode=react keywords=['ip', '地址', 'ifconfig', 'hostname', '网络']", "consistency": 1.0 }, { "task_id": "llm-002", "dimension": "llm_reasoning", "category": "tool_selection", "difficulty": "medium", "passed": true, "expected": "react", "actual": "mode=react tokens=205 len=87", "duration_ms": 5894.9682, "root_cause": "none", "detail": "mode=react keywords=['search', '搜索', 'web', '论文', 'paper', 'agent']", "consistency": 1.0 }, { "task_id": "llm-003", "dimension": "llm_reasoning", "category": "multi_step", "difficulty": "hard", "passed": false, "expected": "react", "actual": "timeout", "duration_ms": 30000.8609, "root_cause": "timeout", "detail": "LLM call timed out after 30s", "consistency": 1.0 }, { "task_id": "llm-004", "dimension": "llm_reasoning", "category": "code_generation", "difficulty": "medium", "passed": true, "expected": "react", "actual": "mode=react tokens=1359 len=1001", "duration_ms": 25149.4865, "root_cause": "none", "detail": "mode=react keywords=['def', 'fib', 'return', 'python']", "consistency": 1.0 }, { "task_id": "llm-005", "dimension": "llm_reasoning", "category": "error_recovery", "difficulty": "hard", "passed": false, "expected": "react", "actual": "timeout", "duration_ms": 30001.2444, "root_cause": "timeout", "detail": "LLM call timed out after 30s", "consistency": 1.0 } ] }, "gui_integration": { "metrics": { "accuracy": 0.8, "precision": 0.8, "recall": 0.8, "f1": 0.8, "latency_p50_ms": 0.0, "latency_p95_ms": 0.0, "latency_p99_ms": 0.0, "consistency": 1.0, "total": 5, "passed": 4, "failed": 1, "accuracy_mean": 0.8, "accuracy_std": 0.0, "ci_lower": 0.3755, "ci_upper": 0.9638 }, "by_category": { "service_startup": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0, "latency_p95_ms": 0.0, "latency_p99_ms": 0.0, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "api_availability": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0, "latency_p95_ms": 0.0, "latency_p99_ms": 0.0, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "websocket": { "accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.0, "latency_p95_ms": 0.0, "latency_p99_ms": 0.0, "consistency": 1.0, "total": 1, "passed": 0, "failed": 1, "accuracy_mean": 0.0, "accuracy_std": 0.0, "ci_lower": 0.0, "ci_upper": 0.7935 }, "frontend": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0, "latency_p95_ms": 0.0, "latency_p99_ms": 0.0, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0, "latency_p95_ms": 0.0, "latency_p99_ms": 0.0, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0, "latency_p95_ms": 0.0, "latency_p99_ms": 0.0, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "hard": { "accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.0, "latency_p95_ms": 0.0, "latency_p99_ms": 0.0, "consistency": 1.0, "total": 1, "passed": 0, "failed": 1, "accuracy_mean": 0.0, "accuracy_std": 0.0, "ci_lower": 0.0, "ci_upper": 0.7935 } }, "cases": [ { "task_id": "gui-001", "dimension": "gui_integration", "category": "service_startup", "difficulty": "easy", "passed": true, "expected": "started", "actual": "started", "duration_ms": 0.0, "root_cause": "none", "detail": "port=64767 pid=20993", "consistency": 1.0 }, { "task_id": "gui-002", "dimension": "gui_integration", "category": "api_availability", "difficulty": "medium", "passed": true, "expected": "200", "actual": "200", "duration_ms": 0.0, "root_cause": "none", "detail": "health=200 skills=200", "consistency": 1.0 }, { "task_id": "gui-003", "dimension": "gui_integration", "category": "api_availability", "difficulty": "medium", "passed": true, "expected": "reachable", "actual": "reachable", "duration_ms": 0.0, "root_cause": "none", "detail": "status=405", "consistency": 1.0 }, { "task_id": "gui-004", "dimension": "gui_integration", "category": "websocket", "difficulty": "hard", "passed": false, "expected": "connected", "actual": "failed", "duration_ms": 0.0, "root_cause": "gui_failure", "detail": "error: server rejected WebSocket connection: HTTP 403", "consistency": 1.0 }, { "task_id": "gui-005", "dimension": "gui_integration", "category": "frontend", "difficulty": "easy", "passed": true, "expected": "html", "actual": "html", "duration_ms": 0.0, "root_cause": "none", "detail": "status=200 len=465", "consistency": 1.0 } ] } }, "baseline_comparison": { "status": "compared", "dimensions": { "preprocessing": { "baseline_accuracy": 1.0, "current_accuracy": 1.0, "change": 0.0, "direction": "—" }, "overfitting": { "baseline_accuracy": 1.0, "current_accuracy": 1.0, "change": 0.0, "direction": "—" }, "efficiency": { "baseline_accuracy": 1.0, "current_accuracy": 1.0, "change": 0.0, "direction": "—" }, "tool_search": { "baseline_accuracy": 1.0, "current_accuracy": 1.0, "change": 0.0, "direction": "—" }, "event_model": { "baseline_accuracy": 1.0, "current_accuracy": 1.0, "change": 0.0, "direction": "—" }, "spec_management": { "baseline_accuracy": 1.0, "current_accuracy": 1.0, "change": 0.0, "direction": "—" }, "verification": { "baseline_accuracy": 1.0, "current_accuracy": 1.0, "change": 0.0, "direction": "—" }, "llm_reasoning": { "baseline_accuracy": 0.0, "current_accuracy": 0.6, "change": 0.6, "direction": "↑" }, "gui_integration": { "baseline_accuracy": 0.0, "current_accuracy": 0.8, "change": 0.8, "direction": "↑" } } } }