239 lines
6.8 KiB
JSON
239 lines
6.8 KiB
JSON
{
|
|
"timestamp": "2026-06-20T11:05:39.446588+00:00",
|
|
"version": "0.1.0",
|
|
"mode": "llm",
|
|
"runs": 3,
|
|
"fast": false,
|
|
"overall_accuracy": 0.8,
|
|
"overall_accuracy_mean": 0.9333,
|
|
"overall_accuracy_std": 0.0,
|
|
"summary": "4/5 tests passed (1 failed) across 1 dimensions.",
|
|
"dimensions": {
|
|
"llm_reasoning": {
|
|
"metrics": {
|
|
"accuracy": 0.8,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 40798.4485,
|
|
"latency_p95_ms": 56307.9299,
|
|
"latency_p99_ms": 59262.5279,
|
|
"consistency": 1.0,
|
|
"total": 5,
|
|
"passed": 4,
|
|
"failed": 1,
|
|
"accuracy_mean": 0.9333,
|
|
"accuracy_std": 0.0943,
|
|
"ci_lower": 0.3755,
|
|
"ci_upper": 0.9638
|
|
},
|
|
"by_category": {
|
|
"intent_understanding": {
|
|
"accuracy": 1.0,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 32004.2511,
|
|
"latency_p95_ms": 32004.2511,
|
|
"latency_p99_ms": 32004.2511,
|
|
"consistency": 1.0,
|
|
"total": 1,
|
|
"passed": 1,
|
|
"failed": 0,
|
|
"accuracy_mean": 1.0,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.2065,
|
|
"ci_upper": 1.0
|
|
},
|
|
"tool_selection": {
|
|
"accuracy": 0.0,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 60001.1774,
|
|
"latency_p95_ms": 60001.1774,
|
|
"latency_p99_ms": 60001.1774,
|
|
"consistency": 1.0,
|
|
"total": 1,
|
|
"passed": 0,
|
|
"failed": 1,
|
|
"accuracy_mean": 0.0,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.0,
|
|
"ci_upper": 0.7935
|
|
},
|
|
"multi_step": {
|
|
"accuracy": 1.0,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 36994.9937,
|
|
"latency_p95_ms": 36994.9937,
|
|
"latency_p99_ms": 36994.9937,
|
|
"consistency": 1.0,
|
|
"total": 1,
|
|
"passed": 1,
|
|
"failed": 0,
|
|
"accuracy_mean": 1.0,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.2065,
|
|
"ci_upper": 1.0
|
|
},
|
|
"code_generation": {
|
|
"accuracy": 1.0,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 41534.9401,
|
|
"latency_p95_ms": 41534.9401,
|
|
"latency_p99_ms": 41534.9401,
|
|
"consistency": 1.0,
|
|
"total": 1,
|
|
"passed": 1,
|
|
"failed": 0,
|
|
"accuracy_mean": 1.0,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.2065,
|
|
"ci_upper": 1.0
|
|
},
|
|
"error_recovery": {
|
|
"accuracy": 1.0,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 40798.4485,
|
|
"latency_p95_ms": 40798.4485,
|
|
"latency_p99_ms": 40798.4485,
|
|
"consistency": 1.0,
|
|
"total": 1,
|
|
"passed": 1,
|
|
"failed": 0,
|
|
"accuracy_mean": 1.0,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.2065,
|
|
"ci_upper": 1.0
|
|
}
|
|
},
|
|
"by_difficulty": {
|
|
"easy": {
|
|
"accuracy": 1.0,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 32004.2511,
|
|
"latency_p95_ms": 32004.2511,
|
|
"latency_p99_ms": 32004.2511,
|
|
"consistency": 1.0,
|
|
"total": 1,
|
|
"passed": 1,
|
|
"failed": 0,
|
|
"accuracy_mean": 1.0,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.2065,
|
|
"ci_upper": 1.0
|
|
},
|
|
"medium": {
|
|
"accuracy": 0.5,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 50768.0587,
|
|
"latency_p95_ms": 59077.8655,
|
|
"latency_p99_ms": 59816.515,
|
|
"consistency": 1.0,
|
|
"total": 2,
|
|
"passed": 1,
|
|
"failed": 1,
|
|
"accuracy_mean": 0.5,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.0945,
|
|
"ci_upper": 0.9055
|
|
},
|
|
"hard": {
|
|
"accuracy": 1.0,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 38896.7211,
|
|
"latency_p95_ms": 40608.2758,
|
|
"latency_p99_ms": 40760.414,
|
|
"consistency": 1.0,
|
|
"total": 2,
|
|
"passed": 2,
|
|
"failed": 0,
|
|
"accuracy_mean": 1.0,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.3424,
|
|
"ci_upper": 1.0
|
|
}
|
|
},
|
|
"cases": [
|
|
{
|
|
"task_id": "llm-001",
|
|
"dimension": "llm_reasoning",
|
|
"category": "intent_understanding",
|
|
"difficulty": "easy",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "mode=react tokens=1249 len=895",
|
|
"duration_ms": 32004.2511,
|
|
"root_cause": "none",
|
|
"detail": "mode=react keywords=['ip', '地址', 'ifconfig', 'hostname', '网络'] stream=False",
|
|
"consistency": 1.0
|
|
},
|
|
{
|
|
"task_id": "llm-002",
|
|
"dimension": "llm_reasoning",
|
|
"category": "tool_selection",
|
|
"difficulty": "medium",
|
|
"passed": false,
|
|
"expected": "react",
|
|
"actual": "timeout",
|
|
"duration_ms": 60001.1774,
|
|
"root_cause": "timeout",
|
|
"detail": "LLM call timed out after 60.0s",
|
|
"consistency": 1.0
|
|
},
|
|
{
|
|
"task_id": "llm-003",
|
|
"dimension": "llm_reasoning",
|
|
"category": "multi_step",
|
|
"difficulty": "hard",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "mode=react tokens=0 len=28",
|
|
"duration_ms": 36994.9937,
|
|
"root_cause": "none",
|
|
"detail": "mode=react keywords=['fib', '递归', '优化', '缓存', 'memo', '迭代', '动态规划', '性能'] stream=True",
|
|
"consistency": 1.0
|
|
},
|
|
{
|
|
"task_id": "llm-004",
|
|
"dimension": "llm_reasoning",
|
|
"category": "code_generation",
|
|
"difficulty": "medium",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "mode=react tokens=2103 len=1517",
|
|
"duration_ms": 41534.9401,
|
|
"root_cause": "none",
|
|
"detail": "mode=react keywords=['def', 'fib', 'return', 'python'] stream=False",
|
|
"consistency": 1.0
|
|
},
|
|
{
|
|
"task_id": "llm-005",
|
|
"dimension": "llm_reasoning",
|
|
"category": "error_recovery",
|
|
"difficulty": "hard",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "mode=react tokens=0 len=52",
|
|
"duration_ms": 40798.4485,
|
|
"root_cause": "none",
|
|
"detail": "mode=react keywords=['pip', 'install', 'agentkit', '安装', '模块'] stream=True",
|
|
"consistency": 1.0
|
|
}
|
|
]
|
|
}
|
|
}
|
|
} |