239 lines
6.8 KiB
JSON
239 lines
6.8 KiB
JSON
{
|
|
"timestamp": "2026-06-20T03:18:35.937935+00:00",
|
|
"version": "0.1.0",
|
|
"mode": "llm",
|
|
"runs": 1,
|
|
"fast": false,
|
|
"overall_accuracy": 0.6,
|
|
"overall_accuracy_mean": 0.6,
|
|
"overall_accuracy_std": 0.0,
|
|
"summary": "3/5 tests passed (2 failed) across 1 dimensions.",
|
|
"dimensions": {
|
|
"llm_reasoning": {
|
|
"metrics": {
|
|
"accuracy": 0.6,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 35309.3238,
|
|
"latency_p95_ms": 41704.3855,
|
|
"latency_p99_ms": 42044.7604,
|
|
"consistency": 1.0,
|
|
"total": 5,
|
|
"passed": 3,
|
|
"failed": 2,
|
|
"accuracy_mean": 0.6,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.2307,
|
|
"ci_upper": 0.8824
|
|
},
|
|
"by_category": {
|
|
"intent_understanding": {
|
|
"accuracy": 0.0,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 20004.7078,
|
|
"latency_p95_ms": 20004.7078,
|
|
"latency_p99_ms": 20004.7078,
|
|
"consistency": 1.0,
|
|
"total": 1,
|
|
"passed": 0,
|
|
"failed": 1,
|
|
"accuracy_mean": 0.0,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.0,
|
|
"ci_upper": 0.7935
|
|
},
|
|
"tool_selection": {
|
|
"accuracy": 1.0,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 5338.8459,
|
|
"latency_p95_ms": 5338.8459,
|
|
"latency_p99_ms": 5338.8459,
|
|
"consistency": 1.0,
|
|
"total": 1,
|
|
"passed": 1,
|
|
"failed": 0,
|
|
"accuracy_mean": 1.0,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.2065,
|
|
"ci_upper": 1.0
|
|
},
|
|
"multi_step": {
|
|
"accuracy": 1.0,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 42129.8541,
|
|
"latency_p95_ms": 42129.8541,
|
|
"latency_p99_ms": 42129.8541,
|
|
"consistency": 1.0,
|
|
"total": 1,
|
|
"passed": 1,
|
|
"failed": 0,
|
|
"accuracy_mean": 1.0,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.2065,
|
|
"ci_upper": 1.0
|
|
},
|
|
"code_generation": {
|
|
"accuracy": 0.0,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 40002.5113,
|
|
"latency_p95_ms": 40002.5113,
|
|
"latency_p99_ms": 40002.5113,
|
|
"consistency": 1.0,
|
|
"total": 1,
|
|
"passed": 0,
|
|
"failed": 1,
|
|
"accuracy_mean": 0.0,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.0,
|
|
"ci_upper": 0.7935
|
|
},
|
|
"error_recovery": {
|
|
"accuracy": 1.0,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 35309.3238,
|
|
"latency_p95_ms": 35309.3238,
|
|
"latency_p99_ms": 35309.3238,
|
|
"consistency": 1.0,
|
|
"total": 1,
|
|
"passed": 1,
|
|
"failed": 0,
|
|
"accuracy_mean": 1.0,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.2065,
|
|
"ci_upper": 1.0
|
|
}
|
|
},
|
|
"by_difficulty": {
|
|
"easy": {
|
|
"accuracy": 0.0,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 20004.7078,
|
|
"latency_p95_ms": 20004.7078,
|
|
"latency_p99_ms": 20004.7078,
|
|
"consistency": 1.0,
|
|
"total": 1,
|
|
"passed": 0,
|
|
"failed": 1,
|
|
"accuracy_mean": 0.0,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.0,
|
|
"ci_upper": 0.7935
|
|
},
|
|
"medium": {
|
|
"accuracy": 0.5,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 22670.6786,
|
|
"latency_p95_ms": 38269.328,
|
|
"latency_p99_ms": 39655.8746,
|
|
"consistency": 1.0,
|
|
"total": 2,
|
|
"passed": 1,
|
|
"failed": 1,
|
|
"accuracy_mean": 0.5,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.0945,
|
|
"ci_upper": 0.9055
|
|
},
|
|
"hard": {
|
|
"accuracy": 1.0,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"latency_p50_ms": 38719.5889,
|
|
"latency_p95_ms": 41788.8276,
|
|
"latency_p99_ms": 42061.6488,
|
|
"consistency": 1.0,
|
|
"total": 2,
|
|
"passed": 2,
|
|
"failed": 0,
|
|
"accuracy_mean": 1.0,
|
|
"accuracy_std": 0.0,
|
|
"ci_lower": 0.3424,
|
|
"ci_upper": 1.0
|
|
}
|
|
},
|
|
"cases": [
|
|
{
|
|
"task_id": "llm-001",
|
|
"dimension": "llm_reasoning",
|
|
"category": "intent_understanding",
|
|
"difficulty": "easy",
|
|
"passed": false,
|
|
"expected": "react",
|
|
"actual": "timeout",
|
|
"duration_ms": 20004.7078,
|
|
"root_cause": "timeout",
|
|
"detail": "LLM call timed out after 20.0s",
|
|
"consistency": 1.0
|
|
},
|
|
{
|
|
"task_id": "llm-002",
|
|
"dimension": "llm_reasoning",
|
|
"category": "tool_selection",
|
|
"difficulty": "medium",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "mode=react tokens=268 len=109",
|
|
"duration_ms": 5338.8459,
|
|
"root_cause": "none",
|
|
"detail": "mode=react keywords=['search', '搜索', 'web', '论文', 'paper', 'agent'] stream=False",
|
|
"consistency": 1.0
|
|
},
|
|
{
|
|
"task_id": "llm-003",
|
|
"dimension": "llm_reasoning",
|
|
"category": "multi_step",
|
|
"difficulty": "hard",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "mode=react tokens=0 len=31",
|
|
"duration_ms": 42129.8541,
|
|
"root_cause": "none",
|
|
"detail": "mode=react keywords=['fib', '递归', '优化', '缓存', 'memo', '迭代', '动态规划', '性能'] stream=True",
|
|
"consistency": 1.0
|
|
},
|
|
{
|
|
"task_id": "llm-004",
|
|
"dimension": "llm_reasoning",
|
|
"category": "code_generation",
|
|
"difficulty": "medium",
|
|
"passed": false,
|
|
"expected": "react",
|
|
"actual": "timeout",
|
|
"duration_ms": 40002.5113,
|
|
"root_cause": "timeout",
|
|
"detail": "LLM call timed out after 40.0s",
|
|
"consistency": 1.0
|
|
},
|
|
{
|
|
"task_id": "llm-005",
|
|
"dimension": "llm_reasoning",
|
|
"category": "error_recovery",
|
|
"difficulty": "hard",
|
|
"passed": true,
|
|
"expected": "react",
|
|
"actual": "mode=react tokens=0 len=54",
|
|
"duration_ms": 35309.3238,
|
|
"root_cause": "none",
|
|
"detail": "mode=react keywords=['pip', 'install', 'agentkit', '安装', '模块'] stream=True",
|
|
"consistency": 1.0
|
|
}
|
|
]
|
|
}
|
|
}
|
|
} |