{ "timestamp": "2026-06-20T11:05:39.446588+00:00", "version": "0.1.0", "mode": "llm", "runs": 3, "fast": false, "overall_accuracy": 0.8, "overall_accuracy_mean": 0.9333, "overall_accuracy_std": 0.0, "summary": "4/5 tests passed (1 failed) across 1 dimensions.", "dimensions": { "llm_reasoning": { "metrics": { "accuracy": 0.8, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 40798.4485, "latency_p95_ms": 56307.9299, "latency_p99_ms": 59262.5279, "consistency": 1.0, "total": 5, "passed": 4, "failed": 1, "accuracy_mean": 0.9333, "accuracy_std": 0.0943, "ci_lower": 0.3755, "ci_upper": 0.9638 }, "by_category": { "intent_understanding": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 32004.2511, "latency_p95_ms": 32004.2511, "latency_p99_ms": 32004.2511, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "tool_selection": { "accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 60001.1774, "latency_p95_ms": 60001.1774, "latency_p99_ms": 60001.1774, "consistency": 1.0, "total": 1, "passed": 0, "failed": 1, "accuracy_mean": 0.0, "accuracy_std": 0.0, "ci_lower": 0.0, "ci_upper": 0.7935 }, "multi_step": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 36994.9937, "latency_p95_ms": 36994.9937, "latency_p99_ms": 36994.9937, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "code_generation": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 41534.9401, "latency_p95_ms": 41534.9401, "latency_p99_ms": 41534.9401, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "error_recovery": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 40798.4485, "latency_p95_ms": 40798.4485, "latency_p99_ms": 40798.4485, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 32004.2511, "latency_p95_ms": 32004.2511, "latency_p99_ms": 32004.2511, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "medium": { "accuracy": 0.5, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 50768.0587, "latency_p95_ms": 59077.8655, "latency_p99_ms": 59816.515, "consistency": 1.0, "total": 2, "passed": 1, "failed": 1, "accuracy_mean": 0.5, "accuracy_std": 0.0, "ci_lower": 0.0945, "ci_upper": 0.9055 }, "hard": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 38896.7211, "latency_p95_ms": 40608.2758, "latency_p99_ms": 40760.414, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 } }, "cases": [ { "task_id": "llm-001", "dimension": "llm_reasoning", "category": "intent_understanding", "difficulty": "easy", "passed": true, "expected": "react", "actual": "mode=react tokens=1249 len=895", "duration_ms": 32004.2511, "root_cause": "none", "detail": "mode=react keywords=['ip', '地址', 'ifconfig', 'hostname', '网络'] stream=False", "consistency": 1.0 }, { "task_id": "llm-002", "dimension": "llm_reasoning", "category": "tool_selection", "difficulty": "medium", "passed": false, "expected": "react", "actual": "timeout", "duration_ms": 60001.1774, "root_cause": "timeout", "detail": "LLM call timed out after 60.0s", "consistency": 1.0 }, { "task_id": "llm-003", "dimension": "llm_reasoning", "category": "multi_step", "difficulty": "hard", "passed": true, "expected": "react", "actual": "mode=react tokens=0 len=28", "duration_ms": 36994.9937, "root_cause": "none", "detail": "mode=react keywords=['fib', '递归', '优化', '缓存', 'memo', '迭代', '动态规划', '性能'] stream=True", "consistency": 1.0 }, { "task_id": "llm-004", "dimension": "llm_reasoning", "category": "code_generation", "difficulty": "medium", "passed": true, "expected": "react", "actual": "mode=react tokens=2103 len=1517", "duration_ms": 41534.9401, "root_cause": "none", "detail": "mode=react keywords=['def', 'fib', 'return', 'python'] stream=False", "consistency": 1.0 }, { "task_id": "llm-005", "dimension": "llm_reasoning", "category": "error_recovery", "difficulty": "hard", "passed": true, "expected": "react", "actual": "mode=react tokens=0 len=52", "duration_ms": 40798.4485, "root_cause": "none", "detail": "mode=react keywords=['pip', 'install', 'agentkit', '安装', '模块'] stream=True", "consistency": 1.0 } ] } } }