334 lines
7.7 KiB
JSON
334 lines
7.7 KiB
JSON
{
|
|
"report_type": "comprehensive_capability_backtest",
|
|
"generated_at": "2026-06-17T03:22:42.152439+00:00",
|
|
"total_score": 100.0,
|
|
"total_cases": 50,
|
|
"total_passed": 50,
|
|
"dimension_scores": {
|
|
"preprocessing_accuracy": 100.0,
|
|
"skill_recall": 100.0,
|
|
"overfitting_detection": 100.0,
|
|
"execution_efficiency": 100.0,
|
|
"tool_search_accuracy": 100.0,
|
|
"event_model_integrity": 100.0,
|
|
"spec_management": 100.0,
|
|
"verification_loop": 100.0
|
|
},
|
|
"dimension_details": {
|
|
"preprocessing_accuracy": {
|
|
"total": 17,
|
|
"passed": 17,
|
|
"score": 100.0,
|
|
"cases": [
|
|
{
|
|
"case_id": "greeting_cn",
|
|
"passed": true,
|
|
"input": "你好",
|
|
"expected": "direct_chat",
|
|
"actual": "direct_chat"
|
|
},
|
|
{
|
|
"case_id": "greeting_en",
|
|
"passed": true,
|
|
"input": "hello",
|
|
"expected": "direct_chat",
|
|
"actual": "direct_chat"
|
|
},
|
|
{
|
|
"case_id": "greeting_hi",
|
|
"passed": true,
|
|
"input": "hi",
|
|
"expected": "direct_chat",
|
|
"actual": "direct_chat"
|
|
},
|
|
{
|
|
"case_id": "chitchat_thanks",
|
|
"passed": true,
|
|
"input": "谢谢",
|
|
"expected": "direct_chat",
|
|
"actual": "direct_chat"
|
|
},
|
|
{
|
|
"case_id": "chitchat_ok",
|
|
"passed": true,
|
|
"input": "好的",
|
|
"expected": "direct_chat",
|
|
"actual": "direct_chat"
|
|
},
|
|
{
|
|
"case_id": "identity_who",
|
|
"passed": true,
|
|
"input": "你是谁",
|
|
"expected": "direct_chat",
|
|
"actual": "direct_chat"
|
|
},
|
|
{
|
|
"case_id": "identity_name",
|
|
"passed": true,
|
|
"input": "你叫什么",
|
|
"expected": "direct_chat",
|
|
"actual": "direct_chat"
|
|
},
|
|
{
|
|
"case_id": "tool_ip",
|
|
"passed": true,
|
|
"input": "查下ip",
|
|
"expected": "react",
|
|
"actual": "react"
|
|
},
|
|
{
|
|
"case_id": "tool_search",
|
|
"passed": true,
|
|
"input": "搜索golang教程",
|
|
"expected": "react",
|
|
"actual": "react"
|
|
},
|
|
{
|
|
"case_id": "tool_shell",
|
|
"passed": true,
|
|
"input": "执行ls命令",
|
|
"expected": "react",
|
|
"actual": "react"
|
|
},
|
|
{
|
|
"case_id": "tool_file",
|
|
"passed": true,
|
|
"input": "读一下配置文件",
|
|
"expected": "react",
|
|
"actual": "react"
|
|
},
|
|
{
|
|
"case_id": "tool_monitor",
|
|
"passed": true,
|
|
"input": "检查服务状态",
|
|
"expected": "react",
|
|
"actual": "react"
|
|
},
|
|
{
|
|
"case_id": "complex_analysis",
|
|
"passed": true,
|
|
"input": "帮我分析一下这个数据并生成报告",
|
|
"expected": "react",
|
|
"actual": "react"
|
|
},
|
|
{
|
|
"case_id": "complex_code",
|
|
"passed": true,
|
|
"input": "重构这个函数使其更高效",
|
|
"expected": "react",
|
|
"actual": "react"
|
|
},
|
|
{
|
|
"case_id": "complex_multi",
|
|
"passed": true,
|
|
"input": "搜索最新的AI论文并总结关键发现",
|
|
"expected": "react",
|
|
"actual": "react"
|
|
},
|
|
{
|
|
"case_id": "skill_prefix_react",
|
|
"passed": true,
|
|
"input": "@skill:react_agent 查看当前ip",
|
|
"expected": "skill_react",
|
|
"actual": "skill_react"
|
|
},
|
|
{
|
|
"case_id": "skill_prefix_coder",
|
|
"passed": true,
|
|
"input": "@skill:coder 写一个函数",
|
|
"expected": "skill_react",
|
|
"actual": "skill_react"
|
|
}
|
|
]
|
|
},
|
|
"skill_recall": {
|
|
"total": 8,
|
|
"passed": 8,
|
|
"score": 100.0,
|
|
"cases": [
|
|
{
|
|
"case_id": "recall_valid_react",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "recall_valid_coder",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "recall_invalid_skill",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "recall_no_prefix_react",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "recall_no_prefix_greeting",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "recall_no_prefix_complex",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "recall_skill_only_prefix",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "recall_skill_with_long_content",
|
|
"passed": true
|
|
}
|
|
]
|
|
},
|
|
"overfitting_detection": {
|
|
"total": 5,
|
|
"passed": 5,
|
|
"score": 100.0,
|
|
"cases": [
|
|
{
|
|
"case_id": "overfit_ip_check",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "overfit_search",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "overfit_greeting",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "overfit_file_read",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "overfit_identity",
|
|
"passed": true
|
|
}
|
|
]
|
|
},
|
|
"execution_efficiency": {
|
|
"total": 5,
|
|
"passed": 5,
|
|
"score": 100.0,
|
|
"cases": [
|
|
{
|
|
"case_id": "efficiency_greeting",
|
|
"passed": true,
|
|
"elapsed_ms": 0.41
|
|
},
|
|
{
|
|
"case_id": "efficiency_chitchat",
|
|
"passed": true,
|
|
"elapsed_ms": 0.47
|
|
},
|
|
{
|
|
"case_id": "efficiency_identity",
|
|
"passed": true,
|
|
"elapsed_ms": 0.48
|
|
},
|
|
{
|
|
"case_id": "efficiency_react_tool",
|
|
"passed": true,
|
|
"elapsed_ms": 0.49
|
|
},
|
|
{
|
|
"case_id": "efficiency_react_complex",
|
|
"passed": true,
|
|
"elapsed_ms": 0.55
|
|
}
|
|
]
|
|
},
|
|
"tool_search_accuracy": {
|
|
"total": 8,
|
|
"passed": 8,
|
|
"score": 100.0,
|
|
"cases": [
|
|
{
|
|
"case_id": "tool_search_read",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "tool_search_write",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "tool_search_web",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "tool_search_shell",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "tool_search_tests",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "tool_search_file_multiple",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "tool_search_no_match",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "tool_search_empty_query",
|
|
"passed": true
|
|
}
|
|
]
|
|
},
|
|
"event_model_integrity": {
|
|
"total": 3,
|
|
"passed": 3,
|
|
"score": 100.0,
|
|
"cases": [
|
|
{
|
|
"case_id": "sq_submit_and_drain",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "eq_emit_and_subscribe",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "event_type_classification",
|
|
"passed": true
|
|
}
|
|
]
|
|
},
|
|
"spec_management": {
|
|
"total": 2,
|
|
"passed": 2,
|
|
"score": 100.0,
|
|
"cases": [
|
|
{
|
|
"case_id": "spec_create_and_get",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "spec_confirm",
|
|
"passed": true
|
|
}
|
|
]
|
|
},
|
|
"verification_loop": {
|
|
"total": 2,
|
|
"passed": 2,
|
|
"score": 100.0,
|
|
"cases": [
|
|
{
|
|
"case_id": "verify_success",
|
|
"passed": true
|
|
},
|
|
{
|
|
"case_id": "verify_failure",
|
|
"passed": true
|
|
}
|
|
]
|
|
}
|
|
},
|
|
"suggestions": [
|
|
"所有维度均达到 100%,架构状态良好"
|
|
]
|
|
} |