{ "timestamp": "2026-06-17T03:54:43.123142+00:00", "version": "0.1.0", "runs": 1, "fast": false, "overall_accuracy": 1.0, "overall_accuracy_mean": 1.0, "overall_accuracy_std": 0.0, "summary": "All 53 tests passed across 7 dimensions.", "dimensions": { "preprocessing": { "metrics": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.016, "latency_p95_ms": 0.4208, "latency_p99_ms": 1.1294, "consistency": 1.0, "total": 15, "passed": 15, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.7961, "ci_upper": 1.0 }, "by_category": { "greeting": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0196, "latency_p95_ms": 0.0241, "latency_p99_ms": 0.0243, "consistency": 1.0, "total": 4, "passed": 4, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5101, "ci_upper": 1.0 }, "tool_query": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0153, "latency_p95_ms": 0.0162, "latency_p99_ms": 0.0164, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "skill_prefix": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0412, "latency_p95_ms": 1.1801, "latency_p99_ms": 1.2813, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "complex": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0147, "latency_p95_ms": 0.0148, "latency_p99_ms": 0.0148, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.017, "latency_p95_ms": 0.0239, "latency_p99_ms": 0.0243, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0156, "latency_p95_ms": 0.0367, "latency_p99_ms": 0.0403, "consistency": 1.0, "total": 7, "passed": 7, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6457, "ci_upper": 1.0 }, "hard": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0147, "latency_p95_ms": 1.1774, "latency_p99_ms": 1.2808, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "cases": [ { "task_id": "prep-001", "dimension": "preprocessing", "category": "greeting", "difficulty": "easy", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0221, "root_cause": "none", "detail": "input='你好' method=regex_direct", "consistency": 1.0 }, { "task_id": "prep-002", "dimension": "preprocessing", "category": "greeting", "difficulty": "easy", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0244, "root_cause": "none", "detail": "input='hello' method=regex_direct", "consistency": 1.0 }, { "task_id": "prep-003", "dimension": "preprocessing", "category": "greeting", "difficulty": "easy", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.017, "root_cause": "none", "detail": "input='谢谢' method=regex_direct", "consistency": 1.0 }, { "task_id": "prep-004", "dimension": "preprocessing", "category": "greeting", "difficulty": "easy", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.016, "root_cause": "none", "detail": "input='你是谁' method=regex_direct", "consistency": 1.0 }, { "task_id": "prep-005", "dimension": "preprocessing", "category": "tool_query", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0164, "root_cause": "none", "detail": "input='搜索golang教程' method=default_react", "consistency": 1.0 }, { "task_id": "prep-006", "dimension": "preprocessing", "category": "tool_query", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0156, "root_cause": "none", "detail": "input='执行ls命令' method=default_react", "consistency": 1.0 }, { "task_id": "prep-007", "dimension": "preprocessing", "category": "tool_query", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0153, "root_cause": "none", "detail": "input='翻译hello为中文' method=default_react", "consistency": 1.0 }, { "task_id": "prep-008", "dimension": "preprocessing", "category": "tool_query", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.014, "root_cause": "none", "detail": "input='什么是机器学习' method=default_react", "consistency": 1.0 }, { "task_id": "prep-009", "dimension": "preprocessing", "category": "tool_query", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0148, "root_cause": "none", "detail": "input='帮我分析数据' method=default_react", "consistency": 1.0 }, { "task_id": "prep-010", "dimension": "preprocessing", "category": "skill_prefix", "difficulty": "medium", "passed": true, "expected": "skill_react", "actual": "skill_react", "duration_ms": 0.0412, "root_cause": "none", "detail": "input='@skill:react_agent 查看ip' method=skill_prefix", "consistency": 1.0 }, { "task_id": "prep-011", "dimension": "preprocessing", "category": "skill_prefix", "difficulty": "medium", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0262, "root_cause": "none", "detail": "input='@skill:chat_only 你好' method=skill_prefix", "consistency": 1.0 }, { "task_id": "prep-012", "dimension": "preprocessing", "category": "skill_prefix", "difficulty": "hard", "passed": true, "expected": "react", "actual": "react", "duration_ms": 1.3066, "root_cause": "none", "detail": "input='@skill:nonexistent 做点什么' method=skill_not_found_fallback", "consistency": 1.0 }, { "task_id": "prep-013", "dimension": "preprocessing", "category": "complex", "difficulty": "hard", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0147, "root_cause": "none", "detail": "input='帮我分析这个数据并生成报告' method=default_react", "consistency": 1.0 }, { "task_id": "prep-014", "dimension": "preprocessing", "category": "complex", "difficulty": "easy", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0148, "root_cause": "none", "detail": "input='随便聊聊' method=default_react", "consistency": 1.0 }, { "task_id": "prep-015", "dimension": "preprocessing", "category": "complex", "difficulty": "hard", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0132, "root_cause": "none", "detail": "input='请帮我完成以下任务:1. 查询天气 2. 生成报告' method=default_react", "consistency": 1.0 } ] }, "overfitting": { "metrics": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0295, "latency_p95_ms": 0.0396, "latency_p99_ms": 0.0401, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "by_category": { "ip_check": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0402, "latency_p95_ms": 0.0402, "latency_p99_ms": 0.0402, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "search": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0282, "latency_p95_ms": 0.0282, "latency_p99_ms": 0.0282, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "greeting": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0373, "latency_p95_ms": 0.0373, "latency_p99_ms": 0.0373, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "tool_use": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0295, "latency_p95_ms": 0.0295, "latency_p99_ms": 0.0295, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "complex": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0249, "latency_p95_ms": 0.0249, "latency_p99_ms": 0.0249, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "by_difficulty": { "medium": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0295, "latency_p95_ms": 0.0391, "latency_p99_ms": 0.04, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "easy": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0373, "latency_p95_ms": 0.0373, "latency_p99_ms": 0.0373, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "hard": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0249, "latency_p95_ms": 0.0249, "latency_p99_ms": 0.0249, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "cases": [ { "task_id": "over-001", "dimension": "overfitting", "category": "ip_check", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0402, "root_cause": "none", "detail": "paraphrases=5 modes=['react', 'react', 'react', 'react', 'react']", "consistency": 1.0 }, { "task_id": "over-002", "dimension": "overfitting", "category": "search", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0282, "root_cause": "none", "detail": "paraphrases=3 modes=['react', 'react', 'react']", "consistency": 1.0 }, { "task_id": "over-003", "dimension": "overfitting", "category": "greeting", "difficulty": "easy", "passed": true, "expected": "direct_chat", "actual": "direct_chat", "duration_ms": 0.0373, "root_cause": "none", "detail": "paraphrases=5 modes=['direct_chat', 'direct_chat', 'direct_chat', 'direct_chat', 'direct_chat']", "consistency": 1.0 }, { "task_id": "over-004", "dimension": "overfitting", "category": "tool_use", "difficulty": "medium", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0295, "root_cause": "none", "detail": "paraphrases=3 modes=['react', 'react', 'react']", "consistency": 1.0 }, { "task_id": "over-005", "dimension": "overfitting", "category": "complex", "difficulty": "hard", "passed": true, "expected": "react", "actual": "react", "duration_ms": 0.0249, "root_cause": "none", "detail": "paraphrases=3 modes=['react', 'react', 'react']", "consistency": 1.0 } ] }, "efficiency": { "metrics": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.33, "latency_p95_ms": 0.602, "latency_p99_ms": 0.6404, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "by_category": { "preprocess_latency": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.33, "latency_p95_ms": 0.402, "latency_p99_ms": 0.4084, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "tool_search_latency": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.345, "latency_p95_ms": 0.6195, "latency_p99_ms": 0.6439, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.16, "latency_p95_ms": 0.268, "latency_p99_ms": 0.2776, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.41, "latency_p95_ms": 0.626, "latency_p99_ms": 0.6452, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "cases": [ { "task_id": "eff-001", "dimension": "efficiency", "category": "preprocess_latency", "difficulty": "easy", "passed": true, "expected": "<=50ms", "actual": "0.003ms", "duration_ms": 0.28, "root_cause": "none", "detail": "iterations=100 avg=0.003ms threshold=50.0ms", "consistency": 1.0 }, { "task_id": "eff-002", "dimension": "efficiency", "category": "preprocess_latency", "difficulty": "medium", "passed": true, "expected": "<=50ms", "actual": "0.003ms", "duration_ms": 0.33, "root_cause": "none", "detail": "iterations=100 avg=0.003ms threshold=50.0ms", "consistency": 1.0 }, { "task_id": "eff-003", "dimension": "efficiency", "category": "preprocess_latency", "difficulty": "medium", "passed": true, "expected": "<=50ms", "actual": "0.004ms", "duration_ms": 0.41, "root_cause": "none", "detail": "iterations=100 avg=0.004ms threshold=50.0ms", "consistency": 1.0 }, { "task_id": "eff-004", "dimension": "efficiency", "category": "tool_search_latency", "difficulty": "medium", "passed": true, "expected": "<=10ms", "actual": "0.006ms", "duration_ms": 0.65, "root_cause": "none", "detail": "iterations=100 avg=0.006ms threshold=10.0ms", "consistency": 1.0 }, { "task_id": "eff-005", "dimension": "efficiency", "category": "tool_search_latency", "difficulty": "easy", "passed": true, "expected": "<=5ms", "actual": "0.000ms", "duration_ms": 0.04, "root_cause": "none", "detail": "iterations=100 avg=0.000ms threshold=5.0ms", "consistency": 1.0 } ] }, "tool_search": { "metrics": { "accuracy": 1.0, "precision": 0.8333, "recall": 0.8333, "f1": 0.8333, "latency_p50_ms": 0.0229, "latency_p95_ms": 0.0415, "latency_p99_ms": 0.0518, "consistency": 1.0, "total": 10, "passed": 10, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.7225, "ci_upper": 1.0 }, "by_category": { "exact_match": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0234, "latency_p95_ms": 0.0487, "latency_p99_ms": 0.0533, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "fuzzy_match": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0224, "latency_p95_ms": 0.0228, "latency_p99_ms": 0.0228, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "no_match": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.0089, "latency_p95_ms": 0.0141, "latency_p99_ms": 0.0146, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "top_k": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0184, "latency_p95_ms": 0.0184, "latency_p99_ms": 0.0184, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.8333, "recall": 0.8333, "f1": 0.8333, "latency_p50_ms": 0.0231, "latency_p95_ms": 0.0458, "latency_p99_ms": 0.0527, "consistency": 1.0, "total": 7, "passed": 7, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6457, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 1.0, "recall": 1.0, "f1": 1.0, "latency_p50_ms": 0.0219, "latency_p95_ms": 0.0227, "latency_p99_ms": 0.0228, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "cases": [ { "task_id": "ts-001", "dimension": "tool_search", "category": "exact_match", "difficulty": "easy", "passed": true, "expected": "read_file", "actual": "read_file", "duration_ms": 0.023, "root_cause": "none", "detail": "query='read file' top_k=5 results=2", "consistency": 1.0 }, { "task_id": "ts-002", "dimension": "tool_search", "category": "exact_match", "difficulty": "easy", "passed": true, "expected": "write_file", "actual": "write_file", "duration_ms": 0.0544, "root_cause": "none", "detail": "query='write file content' top_k=5 results=2", "consistency": 1.0 }, { "task_id": "ts-003", "dimension": "tool_search", "category": "exact_match", "difficulty": "easy", "passed": true, "expected": "web_search", "actual": "web_search", "duration_ms": 0.0258, "root_cause": "none", "detail": "query='search web information' top_k=5 results=2", "consistency": 1.0 }, { "task_id": "ts-004", "dimension": "tool_search", "category": "exact_match", "difficulty": "easy", "passed": true, "expected": "shell_exec", "actual": "shell_exec", "duration_ms": 0.0234, "root_cause": "none", "detail": "query='execute shell command' top_k=5 results=1", "consistency": 1.0 }, { "task_id": "ts-005", "dimension": "tool_search", "category": "exact_match", "difficulty": "easy", "passed": true, "expected": "http_request", "actual": "http_request", "duration_ms": 0.0231, "root_cause": "none", "detail": "query='send http request url' top_k=5 results=1", "consistency": 1.0 }, { "task_id": "ts-006", "dimension": "tool_search", "category": "fuzzy_match", "difficulty": "medium", "passed": true, "expected": "read_file", "actual": "read_file", "duration_ms": 0.0228, "root_cause": "none", "detail": "query='io file' top_k=5 results=2", "consistency": 1.0 }, { "task_id": "ts-007", "dimension": "tool_search", "category": "fuzzy_match", "difficulty": "medium", "passed": true, "expected": "web_search", "actual": "web_search", "duration_ms": 0.0219, "root_cause": "none", "detail": "query='search query engine' top_k=5 results=1", "consistency": 1.0 }, { "task_id": "ts-008", "dimension": "tool_search", "category": "no_match", "difficulty": "easy", "passed": true, "expected": "__none__", "actual": "[]", "duration_ms": 0.003, "root_cause": "none", "detail": "query='' top_k=5 results=0", "consistency": 1.0 }, { "task_id": "ts-009", "dimension": "tool_search", "category": "no_match", "difficulty": "easy", "passed": true, "expected": "__none__", "actual": "[]", "duration_ms": 0.0147, "root_cause": "none", "detail": "query='zzzznonexistent' top_k=5 results=0", "consistency": 1.0 }, { "task_id": "ts-010", "dimension": "tool_search", "category": "top_k", "difficulty": "medium", "passed": true, "expected": "read_file", "actual": "read_file", "duration_ms": 0.0184, "root_cause": "none", "detail": "query='file' top_k=1 results=1", "consistency": 1.0 } ] }, "event_model": { "metrics": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.0894, "latency_p95_ms": 16.7933, "latency_p99_ms": 20.5773, "consistency": 1.0, "total": 6, "passed": 6, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6097, "ci_upper": 1.0 }, "by_category": { "sq_lifecycle": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.0671, "latency_p95_ms": 0.1071, "latency_p99_ms": 0.1107, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 }, "eq_lifecycle": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 2.6035, "latency_p95_ms": 19.6313, "latency_p99_ms": 21.1449, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.0894, "latency_p95_ms": 16.7933, "latency_p99_ms": 20.5773, "consistency": 1.0, "total": 6, "passed": 6, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6097, "ci_upper": 1.0 } }, "cases": [ { "task_id": "ev-001", "dimension": "event_model", "category": "sq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "drained=['hello']", "duration_ms": 0.1116, "root_cause": "none", "detail": "task_id=5c4be886...", "consistency": 1.0 }, { "task_id": "ev-002", "dimension": "event_model", "category": "sq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "cancelled=True", "duration_ms": 0.0671, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "ev-003", "dimension": "event_model", "category": "sq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "raised=True closed=True", "duration_ms": 0.0143, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "ev-004", "dimension": "event_model", "category": "eq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "received=1", "duration_ms": 2.6035, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "ev-005", "dimension": "event_model", "category": "eq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "events=1 closed=True", "duration_ms": 21.5233, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "ev-006", "dimension": "event_model", "category": "eq_lifecycle", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "subscribers=0", "duration_ms": 0.008, "root_cause": "none", "detail": "", "consistency": 1.0 } ] }, "spec_management": { "metrics": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 1.4329, "latency_p95_ms": 2.75, "latency_p99_ms": 3.1046, "consistency": 1.0, "total": 7, "passed": 7, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6457, "ci_upper": 1.0 }, "by_category": { "crud": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 1.4329, "latency_p95_ms": 2.8609, "latency_p99_ms": 3.1268, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "edge": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 0.8834, "latency_p95_ms": 1.6324, "latency_p99_ms": 1.699, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 1.3287, "latency_p95_ms": 2.7777, "latency_p99_ms": 3.1102, "consistency": 1.0, "total": 6, "passed": 6, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.6097, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 1.7156, "latency_p95_ms": 1.7156, "latency_p99_ms": 1.7156, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "cases": [ { "task_id": "sm-001", "dimension": "spec_management", "category": "crud", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "exists=True", "duration_ms": 1.4329, "root_cause": "none", "detail": "path=/var/folders/6b/ljk5bdq50yxcsth24frf05200000gn/T/agentkit-benchmark-dzm9kg48/run-0/specs/sm-001/test-spec.yaml", "consistency": 1.0 }, { "task_id": "sm-002", "dimension": "spec_management", "category": "crud", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "steps=2", "duration_ms": 1.2244, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "sm-003", "dimension": "spec_management", "category": "crud", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "goal=Updated goal", "duration_ms": 1.5311, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "sm-004", "dimension": "spec_management", "category": "crud", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "deleted=True remaining=0", "duration_ms": 1.1484, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "sm-005", "dimension": "spec_management", "category": "crud", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "count=2", "duration_ms": 3.1933, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "sm-006", "dimension": "spec_management", "category": "edge", "difficulty": "medium", "passed": true, "expected": "passed", "actual": "status=confirmed", "duration_ms": 1.7156, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "sm-007", "dimension": "spec_management", "category": "edge", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "result=None", "duration_ms": 0.0512, "root_cause": "none", "detail": "", "consistency": 1.0 } ] }, "verification": { "metrics": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 24.8909, "latency_p95_ms": 411.9118, "latency_p99_ms": 487.0974, "consistency": 1.0, "total": 5, "passed": 5, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.5655, "ci_upper": 1.0 }, "by_category": { "basic": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 11.7309, "latency_p95_ms": 11.9356, "latency_p99_ms": 11.9538, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "retry": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 35.984, "latency_p95_ms": 35.984, "latency_p99_ms": 35.984, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "timeout": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 505.8938, "latency_p95_ms": 505.8938, "latency_p99_ms": 505.8938, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "multi": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 24.8909, "latency_p95_ms": 24.8909, "latency_p99_ms": 24.8909, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 11.7309, "latency_p95_ms": 11.9356, "latency_p99_ms": 11.9538, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 }, "medium": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 35.984, "latency_p95_ms": 458.9028, "latency_p99_ms": 496.4956, "consistency": 1.0, "total": 3, "passed": 3, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.4385, "ci_upper": 1.0 } }, "cases": [ { "task_id": "vf-001", "dimension": "verification", "category": "basic", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "passed=True attempts=1", "duration_ms": 11.5036, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "vf-002", "dimension": "verification", "category": "basic", "difficulty": "easy", "passed": true, "expected": "passed", "actual": "passed=False errors=1", "duration_ms": 11.9583, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "vf-003", "dimension": "verification", "category": "retry", "difficulty": "medium", "passed": true, "expected": "passed", "actual": "attempts=3 callbacks=2", "duration_ms": 35.984, "root_cause": "none", "detail": "", "consistency": 1.0 }, { "task_id": "vf-004", "dimension": "verification", "category": "timeout", "difficulty": "medium", "passed": true, "expected": "passed", "actual": "passed=False errors=1", "duration_ms": 505.8938, "root_cause": "none", "detail": "errors=['Command timed out after 0.5s: sleep 10']", "consistency": 1.0 }, { "task_id": "vf-005", "dimension": "verification", "category": "multi", "difficulty": "medium", "passed": true, "expected": "passed", "actual": "passed=False", "duration_ms": 24.8909, "root_cause": "none", "detail": "", "consistency": 1.0 } ] } } }