fischer-agentkit/test-results/benchmark/benchmark_report.json

{
  "timestamp": "2026-06-20T11:05:39.446588+00:00",
  "version": "0.1.0",
  "mode": "llm",
  "runs": 3,
  "fast": false,
  "overall_accuracy": 0.8,
  "overall_accuracy_mean": 0.9333,
  "overall_accuracy_std": 0.0,
  "summary": "4/5 tests passed (1 failed) across 1 dimensions.",
  "dimensions": {
    "llm_reasoning": {
      "metrics": {
        "accuracy": 0.8,
        "precision": 0.0,
        "recall": 0.0,
        "f1": 0.0,
        "latency_p50_ms": 40798.4485,
        "latency_p95_ms": 56307.9299,
        "latency_p99_ms": 59262.5279,
        "consistency": 1.0,
        "total": 5,
        "passed": 4,
        "failed": 1,
        "accuracy_mean": 0.9333,
        "accuracy_std": 0.0943,
        "ci_lower": 0.3755,
        "ci_upper": 0.9638
      },
      "by_category": {
        "intent_understanding": {
          "accuracy": 1.0,
          "precision": 0.0,
          "recall": 0.0,
          "f1": 0.0,
          "latency_p50_ms": 32004.2511,
          "latency_p95_ms": 32004.2511,
          "latency_p99_ms": 32004.2511,
          "consistency": 1.0,
          "total": 1,
          "passed": 1,
          "failed": 0,
          "accuracy_mean": 1.0,
          "accuracy_std": 0.0,
          "ci_lower": 0.2065,
          "ci_upper": 1.0
        },
        "tool_selection": {
          "accuracy": 0.0,
          "precision": 0.0,
          "recall": 0.0,
          "f1": 0.0,
          "latency_p50_ms": 60001.1774,
          "latency_p95_ms": 60001.1774,
          "latency_p99_ms": 60001.1774,
          "consistency": 1.0,
          "total": 1,
          "passed": 0,
          "failed": 1,
          "accuracy_mean": 0.0,
          "accuracy_std": 0.0,
          "ci_lower": 0.0,
          "ci_upper": 0.7935
        },
        "multi_step": {
          "accuracy": 1.0,
          "precision": 0.0,
          "recall": 0.0,
          "f1": 0.0,
          "latency_p50_ms": 36994.9937,
          "latency_p95_ms": 36994.9937,
          "latency_p99_ms": 36994.9937,
          "consistency": 1.0,
          "total": 1,
          "passed": 1,
          "failed": 0,
          "accuracy_mean": 1.0,
          "accuracy_std": 0.0,
          "ci_lower": 0.2065,
          "ci_upper": 1.0
        },
        "code_generation": {
          "accuracy": 1.0,
          "precision": 0.0,
          "recall": 0.0,
          "f1": 0.0,
          "latency_p50_ms": 41534.9401,
          "latency_p95_ms": 41534.9401,
          "latency_p99_ms": 41534.9401,
          "consistency": 1.0,
          "total": 1,
          "passed": 1,
          "failed": 0,
          "accuracy_mean": 1.0,
          "accuracy_std": 0.0,
          "ci_lower": 0.2065,
          "ci_upper": 1.0
        },
        "error_recovery": {
          "accuracy": 1.0,
          "precision": 0.0,
          "recall": 0.0,
          "f1": 0.0,
          "latency_p50_ms": 40798.4485,
          "latency_p95_ms": 40798.4485,
          "latency_p99_ms": 40798.4485,
          "consistency": 1.0,
          "total": 1,
          "passed": 1,
          "failed": 0,
          "accuracy_mean": 1.0,
          "accuracy_std": 0.0,
          "ci_lower": 0.2065,
          "ci_upper": 1.0
        }
      },
      "by_difficulty": {
        "easy": {
          "accuracy": 1.0,
          "precision": 0.0,
          "recall": 0.0,
          "f1": 0.0,
          "latency_p50_ms": 32004.2511,
          "latency_p95_ms": 32004.2511,
          "latency_p99_ms": 32004.2511,
          "consistency": 1.0,
          "total": 1,
          "passed": 1,
          "failed": 0,
          "accuracy_mean": 1.0,
          "accuracy_std": 0.0,
          "ci_lower": 0.2065,
          "ci_upper": 1.0
        },
        "medium": {
          "accuracy": 0.5,
          "precision": 0.0,
          "recall": 0.0,
          "f1": 0.0,
          "latency_p50_ms": 50768.0587,
          "latency_p95_ms": 59077.8655,
          "latency_p99_ms": 59816.515,
          "consistency": 1.0,
          "total": 2,
          "passed": 1,
          "failed": 1,
          "accuracy_mean": 0.5,
          "accuracy_std": 0.0,
          "ci_lower": 0.0945,
          "ci_upper": 0.9055
        },
        "hard": {
          "accuracy": 1.0,
          "precision": 0.0,
          "recall": 0.0,
          "f1": 0.0,
          "latency_p50_ms": 38896.7211,
          "latency_p95_ms": 40608.2758,
          "latency_p99_ms": 40760.414,
          "consistency": 1.0,
          "total": 2,
          "passed": 2,
          "failed": 0,
          "accuracy_mean": 1.0,
          "accuracy_std": 0.0,
          "ci_lower": 0.3424,
          "ci_upper": 1.0
        }
      },
      "cases": [
        {
          "task_id": "llm-001",
          "dimension": "llm_reasoning",
          "category": "intent_understanding",
          "difficulty": "easy",
          "passed": true,
          "expected": "react",
          "actual": "mode=react tokens=1249 len=895",
          "duration_ms": 32004.2511,
          "root_cause": "none",
          "detail": "mode=react keywords=['ip', '地址', 'ifconfig', 'hostname', '网络'] stream=False",
          "consistency": 1.0
        },
        {
          "task_id": "llm-002",
          "dimension": "llm_reasoning",
          "category": "tool_selection",
          "difficulty": "medium",
          "passed": false,
          "expected": "react",
          "actual": "timeout",
          "duration_ms": 60001.1774,
          "root_cause": "timeout",
          "detail": "LLM call timed out after 60.0s",
          "consistency": 1.0
        },
        {
          "task_id": "llm-003",
          "dimension": "llm_reasoning",
          "category": "multi_step",
          "difficulty": "hard",
          "passed": true,
          "expected": "react",
          "actual": "mode=react tokens=0 len=28",
          "duration_ms": 36994.9937,
          "root_cause": "none",
          "detail": "mode=react keywords=['fib', '递归', '优化', '缓存', 'memo', '迭代', '动态规划', '性能'] stream=True",
          "consistency": 1.0
        },
        {
          "task_id": "llm-004",
          "dimension": "llm_reasoning",
          "category": "code_generation",
          "difficulty": "medium",
          "passed": true,
          "expected": "react",
          "actual": "mode=react tokens=2103 len=1517",
          "duration_ms": 41534.9401,
          "root_cause": "none",
          "detail": "mode=react keywords=['def', 'fib', 'return', 'python'] stream=False",
          "consistency": 1.0
        },
        {
          "task_id": "llm-005",
          "dimension": "llm_reasoning",
          "category": "error_recovery",
          "difficulty": "hard",
          "passed": true,
          "expected": "react",
          "actual": "mode=react tokens=0 len=52",
          "duration_ms": 40798.4485,
          "root_cause": "none",
          "detail": "mode=react keywords=['pip', 'install', 'agentkit', '安装', '模块'] stream=True",
          "consistency": 1.0
        }
      ]
    }
  }
}