fischer-agentkit/test-results/benchmark/benchmark_report.json

239 lines
6.8 KiB
JSON

{
"timestamp": "2026-06-20T11:05:39.446588+00:00",
"version": "0.1.0",
"mode": "llm",
"runs": 3,
"fast": false,
"overall_accuracy": 0.8,
"overall_accuracy_mean": 0.9333,
"overall_accuracy_std": 0.0,
"summary": "4/5 tests passed (1 failed) across 1 dimensions.",
"dimensions": {
"llm_reasoning": {
"metrics": {
"accuracy": 0.8,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 40798.4485,
"latency_p95_ms": 56307.9299,
"latency_p99_ms": 59262.5279,
"consistency": 1.0,
"total": 5,
"passed": 4,
"failed": 1,
"accuracy_mean": 0.9333,
"accuracy_std": 0.0943,
"ci_lower": 0.3755,
"ci_upper": 0.9638
},
"by_category": {
"intent_understanding": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 32004.2511,
"latency_p95_ms": 32004.2511,
"latency_p99_ms": 32004.2511,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
},
"tool_selection": {
"accuracy": 0.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 60001.1774,
"latency_p95_ms": 60001.1774,
"latency_p99_ms": 60001.1774,
"consistency": 1.0,
"total": 1,
"passed": 0,
"failed": 1,
"accuracy_mean": 0.0,
"accuracy_std": 0.0,
"ci_lower": 0.0,
"ci_upper": 0.7935
},
"multi_step": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 36994.9937,
"latency_p95_ms": 36994.9937,
"latency_p99_ms": 36994.9937,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
},
"code_generation": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 41534.9401,
"latency_p95_ms": 41534.9401,
"latency_p99_ms": 41534.9401,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
},
"error_recovery": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 40798.4485,
"latency_p95_ms": 40798.4485,
"latency_p99_ms": 40798.4485,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
}
},
"by_difficulty": {
"easy": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 32004.2511,
"latency_p95_ms": 32004.2511,
"latency_p99_ms": 32004.2511,
"consistency": 1.0,
"total": 1,
"passed": 1,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.2065,
"ci_upper": 1.0
},
"medium": {
"accuracy": 0.5,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 50768.0587,
"latency_p95_ms": 59077.8655,
"latency_p99_ms": 59816.515,
"consistency": 1.0,
"total": 2,
"passed": 1,
"failed": 1,
"accuracy_mean": 0.5,
"accuracy_std": 0.0,
"ci_lower": 0.0945,
"ci_upper": 0.9055
},
"hard": {
"accuracy": 1.0,
"precision": 0.0,
"recall": 0.0,
"f1": 0.0,
"latency_p50_ms": 38896.7211,
"latency_p95_ms": 40608.2758,
"latency_p99_ms": 40760.414,
"consistency": 1.0,
"total": 2,
"passed": 2,
"failed": 0,
"accuracy_mean": 1.0,
"accuracy_std": 0.0,
"ci_lower": 0.3424,
"ci_upper": 1.0
}
},
"cases": [
{
"task_id": "llm-001",
"dimension": "llm_reasoning",
"category": "intent_understanding",
"difficulty": "easy",
"passed": true,
"expected": "react",
"actual": "mode=react tokens=1249 len=895",
"duration_ms": 32004.2511,
"root_cause": "none",
"detail": "mode=react keywords=['ip', '地址', 'ifconfig', 'hostname', '网络'] stream=False",
"consistency": 1.0
},
{
"task_id": "llm-002",
"dimension": "llm_reasoning",
"category": "tool_selection",
"difficulty": "medium",
"passed": false,
"expected": "react",
"actual": "timeout",
"duration_ms": 60001.1774,
"root_cause": "timeout",
"detail": "LLM call timed out after 60.0s",
"consistency": 1.0
},
{
"task_id": "llm-003",
"dimension": "llm_reasoning",
"category": "multi_step",
"difficulty": "hard",
"passed": true,
"expected": "react",
"actual": "mode=react tokens=0 len=28",
"duration_ms": 36994.9937,
"root_cause": "none",
"detail": "mode=react keywords=['fib', '递归', '优化', '缓存', 'memo', '迭代', '动态规划', '性能'] stream=True",
"consistency": 1.0
},
{
"task_id": "llm-004",
"dimension": "llm_reasoning",
"category": "code_generation",
"difficulty": "medium",
"passed": true,
"expected": "react",
"actual": "mode=react tokens=2103 len=1517",
"duration_ms": 41534.9401,
"root_cause": "none",
"detail": "mode=react keywords=['def', 'fib', 'return', 'python'] stream=False",
"consistency": 1.0
},
{
"task_id": "llm-005",
"dimension": "llm_reasoning",
"category": "error_recovery",
"difficulty": "hard",
"passed": true,
"expected": "react",
"actual": "mode=react tokens=0 len=52",
"duration_ms": 40798.4485,
"root_cause": "none",
"detail": "mode=react keywords=['pip', 'install', 'agentkit', '安装', '模块'] stream=True",
"consistency": 1.0
}
]
}
}
}