{ "report_type": "comprehensive_capability_backtest", "generated_at": "2026-06-17T05:29:48.993554+00:00", "total_score": 100.0, "total_cases": 50, "total_passed": 50, "dimension_scores": { "preprocessing_accuracy": 100.0, "skill_recall": 100.0, "overfitting_detection": 100.0, "execution_efficiency": 100.0, "tool_search_accuracy": 100.0, "event_model_integrity": 100.0, "spec_management": 100.0, "verification_loop": 100.0 }, "dimension_details": { "preprocessing_accuracy": { "total": 17, "passed": 17, "score": 100.0, "cases": [ { "case_id": "greeting_cn", "passed": true, "input": "你好", "expected": "direct_chat", "actual": "direct_chat" }, { "case_id": "greeting_en", "passed": true, "input": "hello", "expected": "direct_chat", "actual": "direct_chat" }, { "case_id": "greeting_hi", "passed": true, "input": "hi", "expected": "direct_chat", "actual": "direct_chat" }, { "case_id": "chitchat_thanks", "passed": true, "input": "谢谢", "expected": "direct_chat", "actual": "direct_chat" }, { "case_id": "chitchat_ok", "passed": true, "input": "好的", "expected": "direct_chat", "actual": "direct_chat" }, { "case_id": "identity_who", "passed": true, "input": "你是谁", "expected": "direct_chat", "actual": "direct_chat" }, { "case_id": "identity_name", "passed": true, "input": "你叫什么", "expected": "direct_chat", "actual": "direct_chat" }, { "case_id": "tool_ip", "passed": true, "input": "查下ip", "expected": "react", "actual": "react" }, { "case_id": "tool_search", "passed": true, "input": "搜索golang教程", "expected": "react", "actual": "react" }, { "case_id": "tool_shell", "passed": true, "input": "执行ls命令", "expected": "react", "actual": "react" }, { "case_id": "tool_file", "passed": true, "input": "读一下配置文件", "expected": "react", "actual": "react" }, { "case_id": "tool_monitor", "passed": true, "input": "检查服务状态", "expected": "react", "actual": "react" }, { "case_id": "complex_analysis", "passed": true, "input": "帮我分析一下这个数据并生成报告", "expected": "react", "actual": "react" }, { "case_id": "complex_code", "passed": true, "input": "重构这个函数使其更高效", "expected": "react", "actual": "react" }, { "case_id": "complex_multi", "passed": true, "input": "搜索最新的AI论文并总结关键发现", "expected": "react", "actual": "react" }, { "case_id": "skill_prefix_react", "passed": true, "input": "@skill:react_agent 查看当前ip", "expected": "skill_react", "actual": "skill_react" }, { "case_id": "skill_prefix_coder", "passed": true, "input": "@skill:coder 写一个函数", "expected": "skill_react", "actual": "skill_react" } ] }, "skill_recall": { "total": 8, "passed": 8, "score": 100.0, "cases": [ { "case_id": "recall_valid_react", "passed": true }, { "case_id": "recall_valid_coder", "passed": true }, { "case_id": "recall_invalid_skill", "passed": true }, { "case_id": "recall_no_prefix_react", "passed": true }, { "case_id": "recall_no_prefix_greeting", "passed": true }, { "case_id": "recall_no_prefix_complex", "passed": true }, { "case_id": "recall_skill_only_prefix", "passed": true }, { "case_id": "recall_skill_with_long_content", "passed": true } ] }, "overfitting_detection": { "total": 5, "passed": 5, "score": 100.0, "cases": [ { "case_id": "overfit_ip_check", "passed": true }, { "case_id": "overfit_search", "passed": true }, { "case_id": "overfit_greeting", "passed": true }, { "case_id": "overfit_file_read", "passed": true }, { "case_id": "overfit_identity", "passed": true } ] }, "execution_efficiency": { "total": 5, "passed": 5, "score": 100.0, "cases": [ { "case_id": "efficiency_greeting", "passed": true, "elapsed_ms": 0.39 }, { "case_id": "efficiency_chitchat", "passed": true, "elapsed_ms": 0.38 }, { "case_id": "efficiency_identity", "passed": true, "elapsed_ms": 0.34 }, { "case_id": "efficiency_react_tool", "passed": true, "elapsed_ms": 0.33 }, { "case_id": "efficiency_react_complex", "passed": true, "elapsed_ms": 0.33 } ] }, "tool_search_accuracy": { "total": 8, "passed": 8, "score": 100.0, "cases": [ { "case_id": "tool_search_read", "passed": true }, { "case_id": "tool_search_write", "passed": true }, { "case_id": "tool_search_web", "passed": true }, { "case_id": "tool_search_shell", "passed": true }, { "case_id": "tool_search_tests", "passed": true }, { "case_id": "tool_search_file_multiple", "passed": true }, { "case_id": "tool_search_no_match", "passed": true }, { "case_id": "tool_search_empty_query", "passed": true } ] }, "event_model_integrity": { "total": 3, "passed": 3, "score": 100.0, "cases": [ { "case_id": "sq_submit_and_drain", "passed": true }, { "case_id": "eq_emit_and_subscribe", "passed": true }, { "case_id": "event_type_classification", "passed": true } ] }, "spec_management": { "total": 2, "passed": 2, "score": 100.0, "cases": [ { "case_id": "spec_create_and_get", "passed": true }, { "case_id": "spec_confirm", "passed": true } ] }, "verification_loop": { "total": 2, "passed": 2, "score": 100.0, "cases": [ { "case_id": "verify_success", "passed": true }, { "case_id": "verify_failure", "passed": true } ] } }, "suggestions": [ "所有维度均达到 100%,架构状态良好" ] }