fischer-agentkit/test-results/e2e/comprehensive_report.json

334 lines
7.7 KiB
JSON

{
"report_type": "comprehensive_capability_backtest",
"generated_at": "2026-06-17T05:29:48.993554+00:00",
"total_score": 100.0,
"total_cases": 50,
"total_passed": 50,
"dimension_scores": {
"preprocessing_accuracy": 100.0,
"skill_recall": 100.0,
"overfitting_detection": 100.0,
"execution_efficiency": 100.0,
"tool_search_accuracy": 100.0,
"event_model_integrity": 100.0,
"spec_management": 100.0,
"verification_loop": 100.0
},
"dimension_details": {
"preprocessing_accuracy": {
"total": 17,
"passed": 17,
"score": 100.0,
"cases": [
{
"case_id": "greeting_cn",
"passed": true,
"input": "你好",
"expected": "direct_chat",
"actual": "direct_chat"
},
{
"case_id": "greeting_en",
"passed": true,
"input": "hello",
"expected": "direct_chat",
"actual": "direct_chat"
},
{
"case_id": "greeting_hi",
"passed": true,
"input": "hi",
"expected": "direct_chat",
"actual": "direct_chat"
},
{
"case_id": "chitchat_thanks",
"passed": true,
"input": "谢谢",
"expected": "direct_chat",
"actual": "direct_chat"
},
{
"case_id": "chitchat_ok",
"passed": true,
"input": "好的",
"expected": "direct_chat",
"actual": "direct_chat"
},
{
"case_id": "identity_who",
"passed": true,
"input": "你是谁",
"expected": "direct_chat",
"actual": "direct_chat"
},
{
"case_id": "identity_name",
"passed": true,
"input": "你叫什么",
"expected": "direct_chat",
"actual": "direct_chat"
},
{
"case_id": "tool_ip",
"passed": true,
"input": "查下ip",
"expected": "react",
"actual": "react"
},
{
"case_id": "tool_search",
"passed": true,
"input": "搜索golang教程",
"expected": "react",
"actual": "react"
},
{
"case_id": "tool_shell",
"passed": true,
"input": "执行ls命令",
"expected": "react",
"actual": "react"
},
{
"case_id": "tool_file",
"passed": true,
"input": "读一下配置文件",
"expected": "react",
"actual": "react"
},
{
"case_id": "tool_monitor",
"passed": true,
"input": "检查服务状态",
"expected": "react",
"actual": "react"
},
{
"case_id": "complex_analysis",
"passed": true,
"input": "帮我分析一下这个数据并生成报告",
"expected": "react",
"actual": "react"
},
{
"case_id": "complex_code",
"passed": true,
"input": "重构这个函数使其更高效",
"expected": "react",
"actual": "react"
},
{
"case_id": "complex_multi",
"passed": true,
"input": "搜索最新的AI论文并总结关键发现",
"expected": "react",
"actual": "react"
},
{
"case_id": "skill_prefix_react",
"passed": true,
"input": "@skill:react_agent 查看当前ip",
"expected": "skill_react",
"actual": "skill_react"
},
{
"case_id": "skill_prefix_coder",
"passed": true,
"input": "@skill:coder 写一个函数",
"expected": "skill_react",
"actual": "skill_react"
}
]
},
"skill_recall": {
"total": 8,
"passed": 8,
"score": 100.0,
"cases": [
{
"case_id": "recall_valid_react",
"passed": true
},
{
"case_id": "recall_valid_coder",
"passed": true
},
{
"case_id": "recall_invalid_skill",
"passed": true
},
{
"case_id": "recall_no_prefix_react",
"passed": true
},
{
"case_id": "recall_no_prefix_greeting",
"passed": true
},
{
"case_id": "recall_no_prefix_complex",
"passed": true
},
{
"case_id": "recall_skill_only_prefix",
"passed": true
},
{
"case_id": "recall_skill_with_long_content",
"passed": true
}
]
},
"overfitting_detection": {
"total": 5,
"passed": 5,
"score": 100.0,
"cases": [
{
"case_id": "overfit_ip_check",
"passed": true
},
{
"case_id": "overfit_search",
"passed": true
},
{
"case_id": "overfit_greeting",
"passed": true
},
{
"case_id": "overfit_file_read",
"passed": true
},
{
"case_id": "overfit_identity",
"passed": true
}
]
},
"execution_efficiency": {
"total": 5,
"passed": 5,
"score": 100.0,
"cases": [
{
"case_id": "efficiency_greeting",
"passed": true,
"elapsed_ms": 0.39
},
{
"case_id": "efficiency_chitchat",
"passed": true,
"elapsed_ms": 0.38
},
{
"case_id": "efficiency_identity",
"passed": true,
"elapsed_ms": 0.34
},
{
"case_id": "efficiency_react_tool",
"passed": true,
"elapsed_ms": 0.33
},
{
"case_id": "efficiency_react_complex",
"passed": true,
"elapsed_ms": 0.33
}
]
},
"tool_search_accuracy": {
"total": 8,
"passed": 8,
"score": 100.0,
"cases": [
{
"case_id": "tool_search_read",
"passed": true
},
{
"case_id": "tool_search_write",
"passed": true
},
{
"case_id": "tool_search_web",
"passed": true
},
{
"case_id": "tool_search_shell",
"passed": true
},
{
"case_id": "tool_search_tests",
"passed": true
},
{
"case_id": "tool_search_file_multiple",
"passed": true
},
{
"case_id": "tool_search_no_match",
"passed": true
},
{
"case_id": "tool_search_empty_query",
"passed": true
}
]
},
"event_model_integrity": {
"total": 3,
"passed": 3,
"score": 100.0,
"cases": [
{
"case_id": "sq_submit_and_drain",
"passed": true
},
{
"case_id": "eq_emit_and_subscribe",
"passed": true
},
{
"case_id": "event_type_classification",
"passed": true
}
]
},
"spec_management": {
"total": 2,
"passed": 2,
"score": 100.0,
"cases": [
{
"case_id": "spec_create_and_get",
"passed": true
},
{
"case_id": "spec_confirm",
"passed": true
}
]
},
"verification_loop": {
"total": 2,
"passed": 2,
"score": 100.0,
"cases": [
{
"case_id": "verify_success",
"passed": true
},
{
"case_id": "verify_failure",
"passed": true
}
]
}
},
"suggestions": [
"所有维度均达到 100%,架构状态良好"
]
}