{ "timestamp": "2026-06-20T03:18:35.937935+00:00", "version": "0.1.0", "mode": "llm", "runs": 1, "fast": false, "overall_accuracy": 0.6, "overall_accuracy_mean": 0.6, "overall_accuracy_std": 0.0, "summary": "3/5 tests passed (2 failed) across 1 dimensions.", "dimensions": { "llm_reasoning": { "metrics": { "accuracy": 0.6, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 35309.3238, "latency_p95_ms": 41704.3855, "latency_p99_ms": 42044.7604, "consistency": 1.0, "total": 5, "passed": 3, "failed": 2, "accuracy_mean": 0.6, "accuracy_std": 0.0, "ci_lower": 0.2307, "ci_upper": 0.8824 }, "by_category": { "intent_understanding": { "accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 20004.7078, "latency_p95_ms": 20004.7078, "latency_p99_ms": 20004.7078, "consistency": 1.0, "total": 1, "passed": 0, "failed": 1, "accuracy_mean": 0.0, "accuracy_std": 0.0, "ci_lower": 0.0, "ci_upper": 0.7935 }, "tool_selection": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 5338.8459, "latency_p95_ms": 5338.8459, "latency_p99_ms": 5338.8459, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "multi_step": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 42129.8541, "latency_p95_ms": 42129.8541, "latency_p99_ms": 42129.8541, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 }, "code_generation": { "accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 40002.5113, "latency_p95_ms": 40002.5113, "latency_p99_ms": 40002.5113, "consistency": 1.0, "total": 1, "passed": 0, "failed": 1, "accuracy_mean": 0.0, "accuracy_std": 0.0, "ci_lower": 0.0, "ci_upper": 0.7935 }, "error_recovery": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 35309.3238, "latency_p95_ms": 35309.3238, "latency_p99_ms": 35309.3238, "consistency": 1.0, "total": 1, "passed": 1, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.2065, "ci_upper": 1.0 } }, "by_difficulty": { "easy": { "accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 20004.7078, "latency_p95_ms": 20004.7078, "latency_p99_ms": 20004.7078, "consistency": 1.0, "total": 1, "passed": 0, "failed": 1, "accuracy_mean": 0.0, "accuracy_std": 0.0, "ci_lower": 0.0, "ci_upper": 0.7935 }, "medium": { "accuracy": 0.5, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 22670.6786, "latency_p95_ms": 38269.328, "latency_p99_ms": 39655.8746, "consistency": 1.0, "total": 2, "passed": 1, "failed": 1, "accuracy_mean": 0.5, "accuracy_std": 0.0, "ci_lower": 0.0945, "ci_upper": 0.9055 }, "hard": { "accuracy": 1.0, "precision": 0.0, "recall": 0.0, "f1": 0.0, "latency_p50_ms": 38719.5889, "latency_p95_ms": 41788.8276, "latency_p99_ms": 42061.6488, "consistency": 1.0, "total": 2, "passed": 2, "failed": 0, "accuracy_mean": 1.0, "accuracy_std": 0.0, "ci_lower": 0.3424, "ci_upper": 1.0 } }, "cases": [ { "task_id": "llm-001", "dimension": "llm_reasoning", "category": "intent_understanding", "difficulty": "easy", "passed": false, "expected": "react", "actual": "timeout", "duration_ms": 20004.7078, "root_cause": "timeout", "detail": "LLM call timed out after 20.0s", "consistency": 1.0 }, { "task_id": "llm-002", "dimension": "llm_reasoning", "category": "tool_selection", "difficulty": "medium", "passed": true, "expected": "react", "actual": "mode=react tokens=268 len=109", "duration_ms": 5338.8459, "root_cause": "none", "detail": "mode=react keywords=['search', '搜索', 'web', '论文', 'paper', 'agent'] stream=False", "consistency": 1.0 }, { "task_id": "llm-003", "dimension": "llm_reasoning", "category": "multi_step", "difficulty": "hard", "passed": true, "expected": "react", "actual": "mode=react tokens=0 len=31", "duration_ms": 42129.8541, "root_cause": "none", "detail": "mode=react keywords=['fib', '递归', '优化', '缓存', 'memo', '迭代', '动态规划', '性能'] stream=True", "consistency": 1.0 }, { "task_id": "llm-004", "dimension": "llm_reasoning", "category": "code_generation", "difficulty": "medium", "passed": false, "expected": "react", "actual": "timeout", "duration_ms": 40002.5113, "root_cause": "timeout", "detail": "LLM call timed out after 40.0s", "consistency": 1.0 }, { "task_id": "llm-005", "dimension": "llm_reasoning", "category": "error_recovery", "difficulty": "hard", "passed": true, "expected": "react", "actual": "mode=react tokens=0 len=54", "duration_ms": 35309.3238, "root_cause": "none", "detail": "mode=react keywords=['pip', 'install', 'agentkit', '安装', '模块'] stream=True", "consistency": 1.0 } ] } } }