fischer-agentkit/test-results/benchmark/benchmark_report.json

1569 lines
43 KiB
JSON
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"timestamp": "2026-06-17T04:00:50.738066+00:00",
"version": "0.1.0",
"runs": 3,
"fast": false,
"overall_accuracy": 1.0,
"overall_accuracy_mean": 1.0,
"overall_accuracy_std": 0.0,
"summary": "All 53 tests passed across 7 dimensions.",
"dimensions": {
"preprocessing": {
"metrics": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.006,
"latency_p95_ms": 0.0295,
"latency_p99_ms": 0.0569,
"consistency": 1.0,
"total": 15,
"passed": 15,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.7961,
"ci_upper": 1.0
},
"by_category": {
"greeting": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0069,
"latency_p95_ms": 0.0111,
"latency_p99_ms": 0.0117,
"consistency": 1.0,
"total": 4,
"passed": 4,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.5101,
"ci_upper": 1.0
},
"tool_query": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0051,
"latency_p95_ms": 0.0052,
"latency_p99_ms": 0.0052,
"consistency": 1.0,
"total": 5,
"passed": 5,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.5655,
"ci_upper": 1.0
},
"skill_prefix": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0149,
"latency_p95_ms": 0.0588,
"latency_p99_ms": 0.0627,
"consistency": 1.0,
"total": 3,
"passed": 3,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.4385,
"ci_upper": 1.0
},
"complex": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0056,
"latency_p95_ms": 0.0074,
"latency_p99_ms": 0.0076,
"consistency": 1.0,
"total": 3,
"passed": 3,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.4385,
"ci_upper": 1.0
}
},
"by_difficulty": {
"easy": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0066,
"latency_p95_ms": 0.0109,
"latency_p99_ms": 0.0116,
"consistency": 1.0,
"total": 5,
"passed": 5,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.5655,
"ci_upper": 1.0
},
"medium": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0051,
"latency_p95_ms": 0.0132,
"latency_p99_ms": 0.0146,
"consistency": 1.0,
"total": 7,
"passed": 7,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.6457,
"ci_upper": 1.0
},
"hard": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0076,
"latency_p95_ms": 0.0581,
"latency_p99_ms": 0.0626,
"consistency": 1.0,
"total": 3,
"passed": 3,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.4385,
"ci_upper": 1.0
}
},
"cases": [
{
"task_id": "prep-001",
"dimension": "preprocessing",
"category": "greeting",
"difficulty": "easy",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat",
"duration_ms": 0.0118,
"root_cause": "none",
"detail": "input='你好' method=regex_direct",
"consistency": 1.0
},
{
"task_id": "prep-002",
"dimension": "preprocessing",
"category": "greeting",
"difficulty": "easy",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat",
"duration_ms": 0.0071,
"root_cause": "none",
"detail": "input='hello' method=regex_direct",
"consistency": 1.0
},
{
"task_id": "prep-003",
"dimension": "preprocessing",
"category": "greeting",
"difficulty": "easy",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat",
"duration_ms": 0.0066,
"root_cause": "none",
"detail": "input='谢谢' method=regex_direct",
"consistency": 1.0
},
{
"task_id": "prep-004",
"dimension": "preprocessing",
"category": "greeting",
"difficulty": "easy",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat",
"duration_ms": 0.006,
"root_cause": "none",
"detail": "input='你是谁' method=regex_direct",
"consistency": 1.0
},
{
"task_id": "prep-005",
"dimension": "preprocessing",
"category": "tool_query",
"difficulty": "medium",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.0052,
"root_cause": "none",
"detail": "input='搜索golang教程' method=default_react",
"consistency": 1.0
},
{
"task_id": "prep-006",
"dimension": "preprocessing",
"category": "tool_query",
"difficulty": "medium",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.0046,
"root_cause": "none",
"detail": "input='执行ls命令' method=default_react",
"consistency": 1.0
},
{
"task_id": "prep-007",
"dimension": "preprocessing",
"category": "tool_query",
"difficulty": "medium",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.0051,
"root_cause": "none",
"detail": "input='翻译hello为中文' method=default_react",
"consistency": 1.0
},
{
"task_id": "prep-008",
"dimension": "preprocessing",
"category": "tool_query",
"difficulty": "medium",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.0051,
"root_cause": "none",
"detail": "input='什么是机器学习' method=default_react",
"consistency": 1.0
},
{
"task_id": "prep-009",
"dimension": "preprocessing",
"category": "tool_query",
"difficulty": "medium",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.0047,
"root_cause": "none",
"detail": "input='帮我分析数据' method=default_react",
"consistency": 1.0
},
{
"task_id": "prep-010",
"dimension": "preprocessing",
"category": "skill_prefix",
"difficulty": "medium",
"passed": true,
"expected": "skill_react",
"actual": "skill_react",
"duration_ms": 0.0149,
"root_cause": "none",
"detail": "input='@skill:react_agent 查看ip' method=skill_prefix",
"consistency": 1.0
},
{
"task_id": "prep-011",
"dimension": "preprocessing",
"category": "skill_prefix",
"difficulty": "medium",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat",
"duration_ms": 0.0092,
"root_cause": "none",
"detail": "input='@skill:chat_only 你好' method=skill_prefix",
"consistency": 1.0
},
{
"task_id": "prep-012",
"dimension": "preprocessing",
"category": "skill_prefix",
"difficulty": "hard",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.0637,
"root_cause": "none",
"detail": "input='@skill:nonexistent 做点什么' method=skill_not_found_fallback",
"consistency": 1.0
},
{
"task_id": "prep-013",
"dimension": "preprocessing",
"category": "complex",
"difficulty": "hard",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.0076,
"root_cause": "none",
"detail": "input='帮我分析这个数据并生成报告' method=default_react",
"consistency": 1.0
},
{
"task_id": "prep-014",
"dimension": "preprocessing",
"category": "complex",
"difficulty": "easy",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.0056,
"root_cause": "none",
"detail": "input='随便聊聊' method=default_react",
"consistency": 1.0
},
{
"task_id": "prep-015",
"dimension": "preprocessing",
"category": "complex",
"difficulty": "hard",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.0047,
"root_cause": "none",
"detail": "input='请帮我完成以下任务1. 查询天气 2. 生成报告' method=default_react",
"consistency": 1.0
}
]
},
"overfitting": {
"metrics": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0426,
"latency_p95_ms": 0.0644,
"latency_p99_ms": 0.0675,
"consistency": 1.0,
"total": 5,
"passed": 5,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.5655,
"ci_upper": 1.0
},
"by_category": {
"ip_check": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0426,
"latency_p95_ms": 0.0426,
"latency_p99_ms": 0.0426,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
},
"search": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0309,
"latency_p95_ms": 0.0309,
"latency_p99_ms": 0.0309,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
},
"greeting": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.049,
"latency_p95_ms": 0.049,
"latency_p99_ms": 0.049,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
},
"tool_use": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0252,
"latency_p95_ms": 0.0252,
"latency_p99_ms": 0.0252,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
},
"complex": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0683,
"latency_p95_ms": 0.0683,
"latency_p99_ms": 0.0683,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
}
},
"by_difficulty": {
"medium": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0309,
"latency_p95_ms": 0.0414,
"latency_p99_ms": 0.0424,
"consistency": 1.0,
"total": 3,
"passed": 3,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.4385,
"ci_upper": 1.0
},
"easy": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.049,
"latency_p95_ms": 0.049,
"latency_p99_ms": 0.049,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
},
"hard": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0683,
"latency_p95_ms": 0.0683,
"latency_p99_ms": 0.0683,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
}
},
"cases": [
{
"task_id": "over-001",
"dimension": "overfitting",
"category": "ip_check",
"difficulty": "medium",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.0426,
"root_cause": "none",
"detail": "paraphrases=5 modes=['react', 'react', 'react', 'react', 'react']",
"consistency": 1.0
},
{
"task_id": "over-002",
"dimension": "overfitting",
"category": "search",
"difficulty": "medium",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.0309,
"root_cause": "none",
"detail": "paraphrases=3 modes=['react', 'react', 'react']",
"consistency": 1.0
},
{
"task_id": "over-003",
"dimension": "overfitting",
"category": "greeting",
"difficulty": "easy",
"passed": true,
"expected": "direct_chat",
"actual": "direct_chat",
"duration_ms": 0.049,
"root_cause": "none",
"detail": "paraphrases=5 modes=['direct_chat', 'direct_chat', 'direct_chat', 'direct_chat', 'direct_chat']",
"consistency": 1.0
},
{
"task_id": "over-004",
"dimension": "overfitting",
"category": "tool_use",
"difficulty": "medium",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.0252,
"root_cause": "none",
"detail": "paraphrases=3 modes=['react', 'react', 'react']",
"consistency": 1.0
},
{
"task_id": "over-005",
"dimension": "overfitting",
"category": "complex",
"difficulty": "hard",
"passed": true,
"expected": "react",
"actual": "react",
"duration_ms": 0.0683,
"root_cause": "none",
"detail": "paraphrases=3 modes=['react', 'react', 'react']",
"consistency": 1.0
}
]
},
"efficiency": {
"metrics": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 0.4,
"latency_p95_ms": 0.768,
"latency_p99_ms": 0.8176,
"consistency": 1.0,
"total": 5,
"passed": 5,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.5655,
"ci_upper": 1.0
},
"by_category": {
"preprocess_latency": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 0.4,
"latency_p95_ms": 0.508,
"latency_p99_ms": 0.5176,
"consistency": 1.0,
"total": 3,
"passed": 3,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.4385,
"ci_upper": 1.0
},
"tool_search_latency": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 0.44,
"latency_p95_ms": 0.791,
"latency_p99_ms": 0.8222,
"consistency": 1.0,
"total": 2,
"passed": 2,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.3424,
"ci_upper": 1.0
}
},
"by_difficulty": {
"easy": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 0.2,
"latency_p95_ms": 0.335,
"latency_p99_ms": 0.347,
"consistency": 1.0,
"total": 2,
"passed": 2,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.3424,
"ci_upper": 1.0
},
"medium": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 0.52,
"latency_p95_ms": 0.799,
"latency_p99_ms": 0.8238,
"consistency": 1.0,
"total": 3,
"passed": 3,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.4385,
"ci_upper": 1.0
}
},
"cases": [
{
"task_id": "eff-001",
"dimension": "efficiency",
"category": "preprocess_latency",
"difficulty": "easy",
"passed": true,
"expected": "<=50ms",
"actual": "0.004ms",
"duration_ms": 0.35,
"root_cause": "none",
"detail": "iterations=100 avg=0.004ms threshold=50.0ms",
"consistency": 1.0
},
{
"task_id": "eff-002",
"dimension": "efficiency",
"category": "preprocess_latency",
"difficulty": "medium",
"passed": true,
"expected": "<=50ms",
"actual": "0.004ms",
"duration_ms": 0.4,
"root_cause": "none",
"detail": "iterations=100 avg=0.004ms threshold=50.0ms",
"consistency": 1.0
},
{
"task_id": "eff-003",
"dimension": "efficiency",
"category": "preprocess_latency",
"difficulty": "medium",
"passed": true,
"expected": "<=50ms",
"actual": "0.005ms",
"duration_ms": 0.52,
"root_cause": "none",
"detail": "iterations=100 avg=0.005ms threshold=50.0ms",
"consistency": 1.0
},
{
"task_id": "eff-004",
"dimension": "efficiency",
"category": "tool_search_latency",
"difficulty": "medium",
"passed": true,
"expected": "<=10ms",
"actual": "0.008ms",
"duration_ms": 0.83,
"root_cause": "none",
"detail": "iterations=100 avg=0.008ms threshold=10.0ms",
"consistency": 1.0
},
{
"task_id": "eff-005",
"dimension": "efficiency",
"category": "tool_search_latency",
"difficulty": "easy",
"passed": true,
"expected": "<=5ms",
"actual": "0.000ms",
"duration_ms": 0.05,
"root_cause": "none",
"detail": "iterations=100 avg=0.000ms threshold=5.0ms",
"consistency": 1.0
}
]
},
"tool_search": {
"metrics": {
"accuracy": 1.0,
"precision": 0.8333,
"recall": 0.8333,
"f1": 0.8333,
"latency_p50_ms": 0.0112,
"latency_p95_ms": 0.0153,
"latency_p99_ms": 0.0163,
"consistency": 1.0,
"total": 10,
"passed": 10,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.7225,
"ci_upper": 1.0
},
"by_category": {
"exact_match": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0124,
"latency_p95_ms": 0.016,
"latency_p99_ms": 0.0165,
"consistency": 1.0,
"total": 5,
"passed": 5,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.5655,
"ci_upper": 1.0
},
"fuzzy_match": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0108,
"latency_p95_ms": 0.0111,
"latency_p99_ms": 0.0111,
"consistency": 1.0,
"total": 2,
"passed": 2,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.3424,
"ci_upper": 1.0
},
"no_match": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 0.0044,
"latency_p95_ms": 0.0071,
"latency_p99_ms": 0.0073,
"consistency": 1.0,
"total": 2,
"passed": 2,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.3424,
"ci_upper": 1.0
},
"top_k": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0091,
"latency_p95_ms": 0.0091,
"latency_p99_ms": 0.0091,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
}
},
"by_difficulty": {
"easy": {
"accuracy": 1.0,
"precision": 0.8333,
"recall": 0.8333,
"f1": 0.8333,
"latency_p50_ms": 0.0124,
"latency_p95_ms": 0.0158,
"latency_p99_ms": 0.0164,
"consistency": 1.0,
"total": 7,
"passed": 7,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.6457,
"ci_upper": 1.0
},
"medium": {
"accuracy": 1.0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0,
"latency_p50_ms": 0.0105,
"latency_p95_ms": 0.011,
"latency_p99_ms": 0.0111,
"consistency": 1.0,
"total": 3,
"passed": 3,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.4385,
"ci_upper": 1.0
}
},
"cases": [
{
"task_id": "ts-001",
"dimension": "tool_search",
"category": "exact_match",
"difficulty": "easy",
"passed": true,
"expected": "read_file",
"actual": "read_file",
"duration_ms": 0.0166,
"root_cause": "none",
"detail": "query='read file' top_k=5 results=2",
"consistency": 1.0
},
{
"task_id": "ts-002",
"dimension": "tool_search",
"category": "exact_match",
"difficulty": "easy",
"passed": true,
"expected": "write_file",
"actual": "write_file",
"duration_ms": 0.0138,
"root_cause": "none",
"detail": "query='write file content' top_k=5 results=2",
"consistency": 1.0
},
{
"task_id": "ts-003",
"dimension": "tool_search",
"category": "exact_match",
"difficulty": "easy",
"passed": true,
"expected": "web_search",
"actual": "web_search",
"duration_ms": 0.0124,
"root_cause": "none",
"detail": "query='search web information' top_k=5 results=2",
"consistency": 1.0
},
{
"task_id": "ts-004",
"dimension": "tool_search",
"category": "exact_match",
"difficulty": "easy",
"passed": true,
"expected": "shell_exec",
"actual": "shell_exec",
"duration_ms": 0.0113,
"root_cause": "none",
"detail": "query='execute shell command' top_k=5 results=1",
"consistency": 1.0
},
{
"task_id": "ts-005",
"dimension": "tool_search",
"category": "exact_match",
"difficulty": "easy",
"passed": true,
"expected": "http_request",
"actual": "http_request",
"duration_ms": 0.0124,
"root_cause": "none",
"detail": "query='send http request url' top_k=5 results=1",
"consistency": 1.0
},
{
"task_id": "ts-006",
"dimension": "tool_search",
"category": "fuzzy_match",
"difficulty": "medium",
"passed": true,
"expected": "read_file",
"actual": "read_file",
"duration_ms": 0.0105,
"root_cause": "none",
"detail": "query='io file' top_k=5 results=2",
"consistency": 1.0
},
{
"task_id": "ts-007",
"dimension": "tool_search",
"category": "fuzzy_match",
"difficulty": "medium",
"passed": true,
"expected": "web_search",
"actual": "web_search",
"duration_ms": 0.0111,
"root_cause": "none",
"detail": "query='search query engine' top_k=5 results=1",
"consistency": 1.0
},
{
"task_id": "ts-008",
"dimension": "tool_search",
"category": "no_match",
"difficulty": "easy",
"passed": true,
"expected": "__none__",
"actual": "[]",
"duration_ms": 0.0015,
"root_cause": "none",
"detail": "query='' top_k=5 results=0",
"consistency": 1.0
},
{
"task_id": "ts-009",
"dimension": "tool_search",
"category": "no_match",
"difficulty": "easy",
"passed": true,
"expected": "__none__",
"actual": "[]",
"duration_ms": 0.0074,
"root_cause": "none",
"detail": "query='zzzznonexistent' top_k=5 results=0",
"consistency": 1.0
},
{
"task_id": "ts-010",
"dimension": "tool_search",
"category": "top_k",
"difficulty": "medium",
"passed": true,
"expected": "read_file",
"actual": "read_file",
"duration_ms": 0.0091,
"root_cause": "none",
"detail": "query='file' top_k=1 results=1",
"consistency": 1.0
}
]
},
"event_model": {
"metrics": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 0.0409,
"latency_p95_ms": 15.6839,
"latency_p99_ms": 19.8446,
"consistency": 1.0,
"total": 6,
"passed": 6,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.6097,
"ci_upper": 1.0
},
"by_category": {
"sq_lifecycle": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 0.038,
"latency_p95_ms": 0.0773,
"latency_p99_ms": 0.0808,
"consistency": 1.0,
"total": 3,
"passed": 3,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.4385,
"ci_upper": 1.0
},
"eq_lifecycle": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 0.0438,
"latency_p95_ms": 18.8006,
"latency_p99_ms": 20.4679,
"consistency": 1.0,
"total": 3,
"passed": 3,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.4385,
"ci_upper": 1.0
}
},
"by_difficulty": {
"easy": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 0.0409,
"latency_p95_ms": 15.6839,
"latency_p99_ms": 19.8446,
"consistency": 1.0,
"total": 6,
"passed": 6,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.6097,
"ci_upper": 1.0
}
},
"cases": [
{
"task_id": "ev-001",
"dimension": "event_model",
"category": "sq_lifecycle",
"difficulty": "easy",
"passed": true,
"expected": "passed",
"actual": "drained=['hello']",
"duration_ms": 0.0817,
"root_cause": "none",
"detail": "task_id=b0a1c409...",
"consistency": 1.0
},
{
"task_id": "ev-002",
"dimension": "event_model",
"category": "sq_lifecycle",
"difficulty": "easy",
"passed": true,
"expected": "passed",
"actual": "cancelled=True",
"duration_ms": 0.038,
"root_cause": "none",
"detail": "",
"consistency": 1.0
},
{
"task_id": "ev-003",
"dimension": "event_model",
"category": "sq_lifecycle",
"difficulty": "easy",
"passed": true,
"expected": "passed",
"actual": "raised=True closed=True",
"duration_ms": 0.0091,
"root_cause": "none",
"detail": "",
"consistency": 1.0
},
{
"task_id": "ev-004",
"dimension": "event_model",
"category": "eq_lifecycle",
"difficulty": "easy",
"passed": true,
"expected": "passed",
"actual": "received=1",
"duration_ms": 0.0438,
"root_cause": "none",
"detail": "",
"consistency": 1.0
},
{
"task_id": "ev-005",
"dimension": "event_model",
"category": "eq_lifecycle",
"difficulty": "easy",
"passed": true,
"expected": "passed",
"actual": "events=1 closed=True",
"duration_ms": 20.8847,
"root_cause": "none",
"detail": "",
"consistency": 1.0
},
{
"task_id": "ev-006",
"dimension": "event_model",
"category": "eq_lifecycle",
"difficulty": "easy",
"passed": true,
"expected": "passed",
"actual": "subscribers=0",
"duration_ms": 0.0045,
"root_cause": "none",
"detail": "",
"consistency": 1.0
}
]
},
"spec_management": {
"metrics": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 1.414,
"latency_p95_ms": 3.5951,
"latency_p99_ms": 4.0383,
"consistency": 1.0,
"total": 7,
"passed": 7,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.6457,
"ci_upper": 1.0
},
"by_category": {
"crud": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 1.414,
"latency_p95_ms": 3.6332,
"latency_p99_ms": 4.0459,
"consistency": 1.0,
"total": 5,
"passed": 5,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.5655,
"ci_upper": 1.0
},
"edge": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 1.1783,
"latency_p95_ms": 2.1899,
"latency_p99_ms": 2.2798,
"consistency": 1.0,
"total": 2,
"passed": 2,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.3424,
"ci_upper": 1.0
}
},
"by_difficulty": {
"easy": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 1.3787,
"latency_p95_ms": 3.5042,
"latency_p99_ms": 4.0201,
"consistency": 1.0,
"total": 6,
"passed": 6,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.6097,
"ci_upper": 1.0
},
"medium": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 2.3023,
"latency_p95_ms": 2.3023,
"latency_p99_ms": 2.3023,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
}
},
"cases": [
{
"task_id": "sm-001",
"dimension": "spec_management",
"category": "crud",
"difficulty": "easy",
"passed": true,
"expected": "passed",
"actual": "exists=True",
"duration_ms": 1.414,
"root_cause": "none",
"detail": "path=/var/folders/6b/ljk5bdq50yxcsth24frf05200000gn/T/agentkit-benchmark-pz2hpb1l/run-2/specs/sm-001/test-spec.yaml",
"consistency": 1.0
},
{
"task_id": "sm-002",
"dimension": "spec_management",
"category": "crud",
"difficulty": "easy",
"passed": true,
"expected": "passed",
"actual": "steps=2",
"duration_ms": 1.3435,
"root_cause": "none",
"detail": "",
"consistency": 1.0
},
{
"task_id": "sm-003",
"dimension": "spec_management",
"category": "crud",
"difficulty": "easy",
"passed": true,
"expected": "passed",
"actual": "goal=Updated goal",
"duration_ms": 1.5695,
"root_cause": "none",
"detail": "",
"consistency": 1.0
},
{
"task_id": "sm-004",
"dimension": "spec_management",
"category": "crud",
"difficulty": "easy",
"passed": true,
"expected": "passed",
"actual": "deleted=True remaining=0",
"duration_ms": 1.1556,
"root_cause": "none",
"detail": "",
"consistency": 1.0
},
{
"task_id": "sm-005",
"dimension": "spec_management",
"category": "crud",
"difficulty": "easy",
"passed": true,
"expected": "passed",
"actual": "count=2",
"duration_ms": 4.1491,
"root_cause": "none",
"detail": "",
"consistency": 1.0
},
{
"task_id": "sm-006",
"dimension": "spec_management",
"category": "edge",
"difficulty": "medium",
"passed": true,
"expected": "passed",
"actual": "status=confirmed",
"duration_ms": 2.3023,
"root_cause": "none",
"detail": "",
"consistency": 1.0
},
{
"task_id": "sm-007",
"dimension": "spec_management",
"category": "edge",
"difficulty": "easy",
"passed": true,
"expected": "passed",
"actual": "result=None",
"duration_ms": 0.0544,
"root_cause": "none",
"detail": "",
"consistency": 1.0
}
]
},
"verification": {
"metrics": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 25.4393,
"latency_p95_ms": 413.4245,
"latency_p99_ms": 488.3185,
"consistency": 1.0,
"total": 5,
"passed": 5,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.5655,
"ci_upper": 1.0
},
"by_category": {
"basic": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 12.9474,
"latency_p95_ms": 13.0775,
"latency_p99_ms": 13.0891,
"consistency": 1.0,
"total": 2,
"passed": 2,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.3424,
"ci_upper": 1.0
},
"retry": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 38.9547,
"latency_p95_ms": 38.9547,
"latency_p99_ms": 38.9547,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
},
"timeout": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 507.042,
"latency_p95_ms": 507.042,
"latency_p99_ms": 507.042,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
},
"multi": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 25.4393,
"latency_p95_ms": 25.4393,
"latency_p99_ms": 25.4393,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
}
},
"by_difficulty": {
"easy": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 12.9474,
"latency_p95_ms": 13.0775,
"latency_p99_ms": 13.0891,
"consistency": 1.0,
"total": 2,
"passed": 2,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.3424,
"ci_upper": 1.0
},
"medium": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 38.9547,
"latency_p95_ms": 460.2333,
"latency_p99_ms": 497.6803,
"consistency": 1.0,
"total": 3,
"passed": 3,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.4385,
"ci_upper": 1.0
}
},
"cases": [
{
"task_id": "vf-001",
"dimension": "verification",
"category": "basic",
"difficulty": "easy",
"passed": true,
"expected": "passed",
"actual": "passed=True attempts=1",
"duration_ms": 13.092,
"root_cause": "none",
"detail": "",
"consistency": 1.0
},
{
"task_id": "vf-002",
"dimension": "verification",
"category": "basic",
"difficulty": "easy",
"passed": true,
"expected": "passed",
"actual": "passed=False errors=1",
"duration_ms": 12.8029,
"root_cause": "none",
"detail": "",
"consistency": 1.0
},
{
"task_id": "vf-003",
"dimension": "verification",
"category": "retry",
"difficulty": "medium",
"passed": true,
"expected": "passed",
"actual": "attempts=3 callbacks=2",
"duration_ms": 38.9547,
"root_cause": "none",
"detail": "",
"consistency": 1.0
},
{
"task_id": "vf-004",
"dimension": "verification",
"category": "timeout",
"difficulty": "medium",
"passed": true,
"expected": "passed",
"actual": "passed=False errors=1",
"duration_ms": 507.042,
"root_cause": "none",
"detail": "errors=['Command timed out after 0.5s: sleep 10']",
"consistency": 1.0
},
{
"task_id": "vf-005",
"dimension": "verification",
"category": "multi",
"difficulty": "medium",
"passed": true,
"expected": "passed",
"actual": "passed=False",
"duration_ms": 25.4393,
"root_cause": "none",
"detail": "",
"consistency": 1.0
}
]
}
},
"baseline_comparison": {
"status": "compared",
"dimensions": {
"preprocessing": {
"baseline_accuracy": 1.0,
"current_accuracy": 1.0,
"change": 0.0,
"direction": "—"
},
"overfitting": {
"baseline_accuracy": 1.0,
"current_accuracy": 1.0,
"change": 0.0,
"direction": "—"
},
"efficiency": {
"baseline_accuracy": 1.0,
"current_accuracy": 1.0,
"change": 0.0,
"direction": "—"
},
"tool_search": {
"baseline_accuracy": 1.0,
"current_accuracy": 1.0,
"change": 0.0,
"direction": "—"
},
"event_model": {
"baseline_accuracy": 1.0,
"current_accuracy": 1.0,
"change": 0.0,
"direction": "—"
},
"spec_management": {
"baseline_accuracy": 1.0,
"current_accuracy": 1.0,
"change": 0.0,
"direction": "—"
},
"verification": {
"baseline_accuracy": 1.0,
"current_accuracy": 1.0,
"change": 0.0,
"direction": "—"
}
}
}
}