3213 lines
113 KiB
JSON
3213 lines
113 KiB
JSON
{
|
||
"generated_at": "2026-06-15T16:59:06.575194+00:00",
|
||
"total_observations": 82,
|
||
"overall_skill_recall": 0.9,
|
||
"overall_skill_precision": 0.9,
|
||
"overall_skill_f1": 0.9,
|
||
"overall_execution_mode_accuracy": 0.4038,
|
||
"overall_task_success_rate": 1.0,
|
||
"category_metrics": [
|
||
{
|
||
"category": "routing",
|
||
"subcategory": "explicit_prefix",
|
||
"total": 1,
|
||
"skill_correct": 1,
|
||
"skill_recall": 1.0,
|
||
"skill_precision": 1.0,
|
||
"skill_f1": 1.0,
|
||
"execution_mode_correct": 1,
|
||
"execution_mode_accuracy": 1.0,
|
||
"complexity_correct": 0,
|
||
"complexity_accuracy": 0.0,
|
||
"task_success_rate": 1.0,
|
||
"avg_response_time_ms": 0.05
|
||
},
|
||
{
|
||
"category": "routing",
|
||
"subcategory": "greeting",
|
||
"total": 2,
|
||
"skill_correct": 2,
|
||
"skill_recall": 1.0,
|
||
"skill_precision": 1.0,
|
||
"skill_f1": 1.0,
|
||
"execution_mode_correct": 2,
|
||
"execution_mode_accuracy": 1.0,
|
||
"complexity_correct": 2,
|
||
"complexity_accuracy": 1.0,
|
||
"task_success_rate": 1.0,
|
||
"avg_response_time_ms": 0.03
|
||
},
|
||
{
|
||
"category": "routing",
|
||
"subcategory": "identity",
|
||
"total": 1,
|
||
"skill_correct": 1,
|
||
"skill_recall": 1.0,
|
||
"skill_precision": 1.0,
|
||
"skill_f1": 1.0,
|
||
"execution_mode_correct": 1,
|
||
"execution_mode_accuracy": 1.0,
|
||
"complexity_correct": 1,
|
||
"complexity_accuracy": 1.0,
|
||
"task_success_rate": 1.0,
|
||
"avg_response_time_ms": 0.02
|
||
},
|
||
{
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"total": 62,
|
||
"skill_correct": 21,
|
||
"skill_recall": 0.6774,
|
||
"skill_precision": 0.6774,
|
||
"skill_f1": 0.6774,
|
||
"execution_mode_correct": 32,
|
||
"execution_mode_accuracy": 0.5161,
|
||
"complexity_correct": 22,
|
||
"complexity_accuracy": 0.3548,
|
||
"task_success_rate": 1.0,
|
||
"avg_response_time_ms": 4449.27
|
||
},
|
||
{
|
||
"category": "semantic_router",
|
||
"subcategory": "colloquial_match",
|
||
"total": 5,
|
||
"skill_correct": 5,
|
||
"skill_recall": 1.0,
|
||
"skill_precision": 1.0,
|
||
"skill_f1": 1.0,
|
||
"execution_mode_correct": 0,
|
||
"execution_mode_accuracy": 0.0,
|
||
"complexity_correct": 1,
|
||
"complexity_accuracy": 0.2,
|
||
"task_success_rate": 1.0,
|
||
"avg_response_time_ms": 2410.72
|
||
},
|
||
{
|
||
"category": "semantic_router",
|
||
"subcategory": "description_match",
|
||
"total": 8,
|
||
"skill_correct": 4,
|
||
"skill_recall": 1.0,
|
||
"skill_precision": 1.0,
|
||
"skill_f1": 1.0,
|
||
"execution_mode_correct": 3,
|
||
"execution_mode_accuracy": 0.375,
|
||
"complexity_correct": 2,
|
||
"complexity_accuracy": 0.25,
|
||
"task_success_rate": 1.0,
|
||
"avg_response_time_ms": 891.55
|
||
},
|
||
{
|
||
"category": "semantic_router",
|
||
"subcategory": "mixed_lang_match",
|
||
"total": 3,
|
||
"skill_correct": 2,
|
||
"skill_recall": 1.0,
|
||
"skill_precision": 1.0,
|
||
"skill_f1": 1.0,
|
||
"execution_mode_correct": 1,
|
||
"execution_mode_accuracy": 0.3333,
|
||
"complexity_correct": 2,
|
||
"complexity_accuracy": 0.6667,
|
||
"task_success_rate": 1.0,
|
||
"avg_response_time_ms": 0.89
|
||
}
|
||
],
|
||
"overfitting_results": [
|
||
{
|
||
"benchmark_id": "route-kw-direct-001",
|
||
"original_correct": true,
|
||
"paraphrase_results": [
|
||
true,
|
||
true,
|
||
true
|
||
],
|
||
"consistency_rate": 1.0,
|
||
"is_overfitted": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-002",
|
||
"original_correct": true,
|
||
"paraphrase_results": [
|
||
false,
|
||
true,
|
||
true
|
||
],
|
||
"consistency_rate": 0.6667,
|
||
"is_overfitted": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-003",
|
||
"original_correct": true,
|
||
"paraphrase_results": [
|
||
true,
|
||
false,
|
||
true
|
||
],
|
||
"consistency_rate": 0.6667,
|
||
"is_overfitted": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-001",
|
||
"original_correct": false,
|
||
"paraphrase_results": [
|
||
false,
|
||
false,
|
||
false
|
||
],
|
||
"consistency_rate": 1.0,
|
||
"is_overfitted": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-002",
|
||
"original_correct": false,
|
||
"paraphrase_results": [
|
||
false,
|
||
false,
|
||
true
|
||
],
|
||
"consistency_rate": 0.6667,
|
||
"is_overfitted": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-003",
|
||
"original_correct": false,
|
||
"paraphrase_results": [
|
||
false,
|
||
false,
|
||
false
|
||
],
|
||
"consistency_rate": 1.0,
|
||
"is_overfitted": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-001",
|
||
"original_correct": false,
|
||
"paraphrase_results": [
|
||
true,
|
||
false,
|
||
false
|
||
],
|
||
"consistency_rate": 0.6667,
|
||
"is_overfitted": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-002",
|
||
"original_correct": false,
|
||
"paraphrase_results": [
|
||
false,
|
||
false,
|
||
false
|
||
],
|
||
"consistency_rate": 1.0,
|
||
"is_overfitted": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-reflex-001",
|
||
"original_correct": false,
|
||
"paraphrase_results": [
|
||
false,
|
||
false,
|
||
false
|
||
],
|
||
"consistency_rate": 1.0,
|
||
"is_overfitted": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-reflex-002",
|
||
"original_correct": false,
|
||
"paraphrase_results": [
|
||
false,
|
||
false,
|
||
false
|
||
],
|
||
"consistency_rate": 1.0,
|
||
"is_overfitted": false
|
||
}
|
||
],
|
||
"overfitting_score": 0.2222,
|
||
"weaknesses": [
|
||
{
|
||
"dimension": "semantic_router",
|
||
"subcategory": "colloquial_match",
|
||
"severity": "high",
|
||
"description": "执行模式准确率过低 (0.00%),子类别: colloquial_match",
|
||
"evidence": "正确数=0/5",
|
||
"suggestion": "检查复杂度估算和模式选择逻辑"
|
||
},
|
||
{
|
||
"dimension": "semantic_router",
|
||
"subcategory": "description_match",
|
||
"severity": "high",
|
||
"description": "执行模式准确率过低 (37.50%),子类别: description_match",
|
||
"evidence": "正确数=3/8",
|
||
"suggestion": "检查复杂度估算和模式选择逻辑"
|
||
},
|
||
{
|
||
"dimension": "semantic_router",
|
||
"subcategory": "mixed_lang_match",
|
||
"severity": "high",
|
||
"description": "执行模式准确率过低 (33.33%),子类别: mixed_lang_match",
|
||
"evidence": "正确数=1/3",
|
||
"suggestion": "检查复杂度估算和模式选择逻辑"
|
||
},
|
||
{
|
||
"dimension": "routing",
|
||
"subcategory": "keyword_match",
|
||
"severity": "medium",
|
||
"description": "技能路由F1偏低 (0.68),子类别: keyword_match",
|
||
"evidence": "召回率=67.74%, 精确率=67.74%, 样本数=62",
|
||
"suggestion": "微调路由阈值或增加更多意图示例"
|
||
},
|
||
{
|
||
"dimension": "routing",
|
||
"subcategory": "keyword_match",
|
||
"severity": "medium",
|
||
"description": "执行模式准确率过低 (51.61%),子类别: keyword_match",
|
||
"evidence": "正确数=32/62",
|
||
"suggestion": "检查复杂度估算和模式选择逻辑"
|
||
}
|
||
],
|
||
"root_causes": [
|
||
{
|
||
"cause_type": "complexity_misjudge",
|
||
"cause_description": "复杂度估算偏差:倾向高估复杂度(将简单任务误判为需要多步推理)",
|
||
"confidence": 0.75,
|
||
"affected_cases": [
|
||
"route-kw-rewoo-001",
|
||
"route-kw-rewoo-002",
|
||
"route-kw-reflex-001",
|
||
"route-kw-reflex-002",
|
||
"route-kw-planexec-001",
|
||
"route-kw-coderev-001",
|
||
"route-kw-geo-001",
|
||
"route-kw-deai-001",
|
||
"route-kw-content-001",
|
||
"route-kw-citation-001"
|
||
],
|
||
"detail": "共 31 个执行模式判断错误。低估复杂度 0 次,高估复杂度 1 次。受影响子类别: description_match, mixed_lang_match, colloquial_match, keyword_match"
|
||
},
|
||
{
|
||
"cause_type": "intent_ambiguous",
|
||
"cause_description": "意图歧义:不同技能的关键词/意图描述重叠,导致路由混淆",
|
||
"confidence": 0.7,
|
||
"affected_cases": [
|
||
"route-kw-rewoo-001",
|
||
"route-kw-rewoo-001",
|
||
"route-kw-reflex-001"
|
||
],
|
||
"detail": "技能混淆对: rewoo_agent→competitor_analyzer(2次); reflexion_agent→code_reviewer(1次)"
|
||
},
|
||
{
|
||
"cause_type": "quality_threshold",
|
||
"cause_description": "质量门控阈值过低:任务虽成功完成但输出了错误结果",
|
||
"confidence": 0.6,
|
||
"affected_cases": [
|
||
"route-kw-rewoo-001",
|
||
"route-kw-rewoo-001",
|
||
"route-kw-reflex-001"
|
||
],
|
||
"detail": "共 3 个任务虽然HTTP成功但路由到了错误技能。质量门控未能拦截这些错误路由的结果。"
|
||
}
|
||
],
|
||
"improvement_plans": [
|
||
{
|
||
"weakness_description": "意图歧义:不同技能的关键词/意图描述重叠,导致路由混淆",
|
||
"root_causes": [
|
||
{
|
||
"cause_type": "intent_ambiguous",
|
||
"cause_description": "意图歧义:不同技能的关键词/意图描述重叠,导致路由混淆",
|
||
"confidence": 0.7,
|
||
"affected_cases": [
|
||
"route-kw-rewoo-001",
|
||
"route-kw-rewoo-001",
|
||
"route-kw-reflex-001"
|
||
],
|
||
"detail": "技能混淆对: rewoo_agent→competitor_analyzer(2次); reflexion_agent→code_reviewer(1次)"
|
||
}
|
||
],
|
||
"actions": [
|
||
{
|
||
"action_id": "ACT-001",
|
||
"title": "为易混淆技能添加互斥关键词",
|
||
"description": "在技能配置中为容易混淆的技能对添加互斥关键词(disambiguation_keywords),当用户输入同时匹配多个技能时,优先选择包含互斥关键词的技能。",
|
||
"target_module": "configs/skills/*.yaml → intent.disambiguation_keywords",
|
||
"priority": "P1",
|
||
"expected_impact": "预计提升精确率 10~25%,减少技能混淆",
|
||
"effort": "small",
|
||
"related_causes": [
|
||
"intent_ambiguous"
|
||
],
|
||
"verification": "运行歧义消解回测,验证路由精确率提升"
|
||
},
|
||
{
|
||
"action_id": "ACT-002",
|
||
"title": "实现LLM二次分类消歧",
|
||
"description": "当 Layer 0/1 路由到多个候选技能时,调用 LLM quick_classify 进行二次意图判断,选择最匹配的技能。",
|
||
"target_module": "src/agentkit/chat/skill_routing.py → Layer 1",
|
||
"priority": "P2",
|
||
"expected_impact": "预计提升精确率 15~30%,但增加 ~500ms 延迟和 ~100 tokens",
|
||
"effort": "medium",
|
||
"related_causes": [
|
||
"intent_ambiguous"
|
||
],
|
||
"verification": "运行歧义消解回测,对比延迟和精确率变化"
|
||
}
|
||
],
|
||
"overall_strategy": "短期:添加互斥关键词消歧;中期:启用LLM二次分类;长期:训练专用意图分类模型替代规则匹配"
|
||
},
|
||
{
|
||
"weakness_description": "复杂度估算偏差:倾向高估复杂度(将简单任务误判为需要多步推理)",
|
||
"root_causes": [
|
||
{
|
||
"cause_type": "complexity_misjudge",
|
||
"cause_description": "复杂度估算偏差:倾向高估复杂度(将简单任务误判为需要多步推理)",
|
||
"confidence": 0.75,
|
||
"affected_cases": [
|
||
"route-kw-rewoo-001",
|
||
"route-kw-rewoo-002",
|
||
"route-kw-reflex-001",
|
||
"route-kw-reflex-002",
|
||
"route-kw-planexec-001",
|
||
"route-kw-coderev-001",
|
||
"route-kw-geo-001",
|
||
"route-kw-deai-001",
|
||
"route-kw-content-001",
|
||
"route-kw-citation-001"
|
||
],
|
||
"detail": "共 31 个执行模式判断错误。低估复杂度 0 次,高估复杂度 1 次。受影响子类别: description_match, mixed_lang_match, colloquial_match, keyword_match"
|
||
}
|
||
],
|
||
"actions": [
|
||
{
|
||
"action_id": "ACT-003",
|
||
"title": "优化复杂度估算启发式规则",
|
||
"description": "调整 HeuristicClassifier 的复杂度评分权重:增加任务动词(分析/研究/设计)的权重,降低简单问答动词(是什么/多少)的权重。",
|
||
"target_module": "src/agentkit/chat/skill_routing.py → HeuristicClassifier",
|
||
"priority": "P1",
|
||
"expected_impact": "预计提升执行模式准确率 10~20%",
|
||
"effort": "small",
|
||
"related_causes": [
|
||
"complexity_misjudge"
|
||
],
|
||
"verification": "运行执行模式回测,验证准确率提升"
|
||
},
|
||
{
|
||
"action_id": "ACT-004",
|
||
"title": "引入任务复杂度校准数据集",
|
||
"description": "收集标注了复杂度等级的真实用户查询,构建校准数据集,定期评估和调整复杂度阈值。",
|
||
"target_module": "tests/e2e/benchmark_dataset.py",
|
||
"priority": "P2",
|
||
"expected_impact": "持续提升复杂度判断准确性",
|
||
"effort": "medium",
|
||
"related_causes": [
|
||
"complexity_misjudge"
|
||
],
|
||
"verification": "每次调整后运行回测,对比前后F1变化"
|
||
}
|
||
],
|
||
"overall_strategy": "短期:调整启发式规则权重;中期:构建复杂度校准数据集;长期:训练复杂度评估模型替代规则"
|
||
},
|
||
{
|
||
"weakness_description": "质量门控阈值过低:任务虽成功完成但输出了错误结果",
|
||
"root_causes": [
|
||
{
|
||
"cause_type": "quality_threshold",
|
||
"cause_description": "质量门控阈值过低:任务虽成功完成但输出了错误结果",
|
||
"confidence": 0.6,
|
||
"affected_cases": [
|
||
"route-kw-rewoo-001",
|
||
"route-kw-rewoo-001",
|
||
"route-kw-reflex-001"
|
||
],
|
||
"detail": "共 3 个任务虽然HTTP成功但路由到了错误技能。质量门控未能拦截这些错误路由的结果。"
|
||
}
|
||
],
|
||
"actions": [
|
||
{
|
||
"action_id": "ACT-005",
|
||
"title": "增强质量门控的技能匹配验证",
|
||
"description": "在QualityGate中增加技能匹配验证:检查输出是否与路由到的技能的能力范围一致,如果不一致则触发重试或降级。",
|
||
"target_module": "src/agentkit/quality/gate.py",
|
||
"priority": "P1",
|
||
"expected_impact": "减少错误路由导致的低质量输出",
|
||
"effort": "medium",
|
||
"related_causes": [
|
||
"quality_threshold"
|
||
],
|
||
"verification": "运行质量门控回测,验证错误路由拦截率"
|
||
}
|
||
],
|
||
"overall_strategy": "短期:增加技能匹配验证;中期:引入输出质量评分模型;长期:实现自动质量回归检测"
|
||
}
|
||
],
|
||
"raw_observations": [
|
||
{
|
||
"benchmark_id": "route-edge-greet-001",
|
||
"test_name": "layer0_route-edge-greet-001",
|
||
"timestamp": "2026-06-15T16:50:48.898798+00:00",
|
||
"input_query": "你好",
|
||
"is_paraphrase": false,
|
||
"expected_skill": null,
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "greeting",
|
||
"response_time_ms": 0.031158037018030882,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-edge-greet-002",
|
||
"test_name": "layer0_route-edge-greet-002",
|
||
"timestamp": "2026-06-15T16:50:48.900854+00:00",
|
||
"input_query": "Good morning!",
|
||
"is_paraphrase": false,
|
||
"expected_skill": null,
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "greeting",
|
||
"response_time_ms": 0.02913799835368991,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-edge-identity-001",
|
||
"test_name": "layer0_route-edge-identity-001",
|
||
"timestamp": "2026-06-15T16:50:48.902316+00:00",
|
||
"input_query": "你是谁?",
|
||
"is_paraphrase": false,
|
||
"expected_skill": null,
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "identity",
|
||
"response_time_ms": 0.020894978661090136,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-edge-explicit-001",
|
||
"test_name": "layer0_route-edge-explicit-001",
|
||
"timestamp": "2026-06-15T16:50:48.903641+00:00",
|
||
"input_query": "@skill:react_agent 搜索最新的AI新闻",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "react_agent",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "explicit_prefix",
|
||
"response_time_ms": 0.049009977374225855,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-001",
|
||
"test_name": "layer1_route-kw-direct-001",
|
||
"timestamp": "2026-06-15T16:50:48.905245+00:00",
|
||
"input_query": "翻译这段话",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "direct_agent",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.22018200252205133,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-002",
|
||
"test_name": "layer1_route-kw-direct-002",
|
||
"timestamp": "2026-06-15T16:51:05.483207+00:00",
|
||
"input_query": "帮我总结一下",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "direct_agent",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 16576.47029601503,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-003",
|
||
"test_name": "layer1_route-kw-direct-003",
|
||
"timestamp": "2026-06-15T16:51:05.485175+00:00",
|
||
"input_query": "什么是RAG?",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "direct_agent",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.1331570092588663,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-001",
|
||
"test_name": "layer1_route-kw-react-001",
|
||
"timestamp": "2026-06-15T16:51:05.489493+00:00",
|
||
"input_query": "搜索一下AI Agent市场数据",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.665565989445895,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-002",
|
||
"test_name": "layer1_route-kw-react-002",
|
||
"timestamp": "2026-06-15T16:51:17.983942+00:00",
|
||
"input_query": "帮我分析这个数据",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 12492.459080996923,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-003",
|
||
"test_name": "layer1_route-kw-react-003",
|
||
"timestamp": "2026-06-15T16:51:17.987215+00:00",
|
||
"input_query": "实时监控竞品动态",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 1.6594339977018535,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-001",
|
||
"test_name": "layer1_route-kw-rewoo-001",
|
||
"timestamp": "2026-06-15T16:51:17.988662+00:00",
|
||
"input_query": "采集A、B、C三个竞品的功能数据",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "rewoo_agent",
|
||
"expected_execution_mode": "rewoo",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "competitor_analyzer",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": false,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.11222198372706771,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-002",
|
||
"test_name": "layer1_route-kw-rewoo-002",
|
||
"timestamp": "2026-06-15T16:51:30.397745+00:00",
|
||
"input_query": "并行搜索多个关键词",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "rewoo_agent",
|
||
"expected_execution_mode": "rewoo",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 12407.641994010191,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-reflex-001",
|
||
"test_name": "layer1_route-kw-reflex-001",
|
||
"timestamp": "2026-06-15T16:51:30.401715+00:00",
|
||
"input_query": "审查这段代码的合规性",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "reflexion_agent",
|
||
"expected_execution_mode": "reflexion",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.146629965864122,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-reflex-002",
|
||
"test_name": "layer1_route-kw-reflex-002",
|
||
"timestamp": "2026-06-15T16:51:41.776746+00:00",
|
||
"input_query": "生成一个高精度的数据分析脚本",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "reflexion_agent",
|
||
"expected_execution_mode": "reflexion",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 11373.252779012546,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-planexec-001",
|
||
"test_name": "layer1_route-kw-planexec-001",
|
||
"timestamp": "2026-06-15T16:51:41.781653+00:00",
|
||
"input_query": "生成一份市场分析报告",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "plan_exec_agent",
|
||
"expected_execution_mode": "plan_exec",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.6980440015904605,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-planexec-002",
|
||
"test_name": "layer1_route-kw-planexec-002",
|
||
"timestamp": "2026-06-15T16:51:58.323009+00:00",
|
||
"input_query": "规划产品优化方案",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "plan_exec_agent",
|
||
"expected_execution_mode": "plan_exec",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "plan_exec_agent",
|
||
"actual_execution_mode": "plan_exec",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 16539.517820987385,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-coderev-001",
|
||
"test_name": "layer1_route-kw-coderev-001",
|
||
"timestamp": "2026-06-15T16:51:58.326721+00:00",
|
||
"input_query": "Review this code for quality",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "code_reviewer",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.0271629909984767,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-geo-001",
|
||
"test_name": "layer1_route-kw-geo-001",
|
||
"timestamp": "2026-06-15T16:51:58.328807+00:00",
|
||
"input_query": "帮我优化这篇文章的SEO",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "geo_optimizer",
|
||
"expected_execution_mode": "llm_generate",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "geo_optimizer",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.13758597197011113,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-deai-001",
|
||
"test_name": "layer1_route-kw-deai-001",
|
||
"timestamp": "2026-06-15T16:51:58.331082+00:00",
|
||
"input_query": "帮我把这篇文章去AI化",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "deai_agent",
|
||
"expected_execution_mode": "llm_generate",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "deai_agent",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.13313599629327655,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-content-001",
|
||
"test_name": "layer1_route-kw-content-001",
|
||
"timestamp": "2026-06-15T16:52:04.523209+00:00",
|
||
"input_query": "帮我写一篇关于AI的文章",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "content_generator",
|
||
"expected_execution_mode": "llm_generate",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "content_generator",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 6190.30976598151,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-citation-001",
|
||
"test_name": "layer1_route-kw-citation-001",
|
||
"timestamp": "2026-06-15T16:52:04.527767+00:00",
|
||
"input_query": "检测我们的品牌在AI平台的引用情况",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "citation_detector",
|
||
"expected_execution_mode": "custom",
|
||
"expected_complexity": "medium",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.5777029804885387,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-trend-001",
|
||
"test_name": "layer1_route-kw-trend-001",
|
||
"timestamp": "2026-06-15T16:52:12.996375+00:00",
|
||
"input_query": "分析品牌趋势",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "trend_agent",
|
||
"expected_execution_mode": "tool_call",
|
||
"expected_complexity": "medium",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 8466.026534966659,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-competitor-001",
|
||
"test_name": "layer1_route-kw-competitor-001",
|
||
"timestamp": "2026-06-15T16:52:13.000816+00:00",
|
||
"input_query": "分析我的竞品策略",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "competitor_analyzer",
|
||
"expected_execution_mode": "tool_call",
|
||
"expected_complexity": "medium",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.310166019015014,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-schema-001",
|
||
"test_name": "layer1_route-kw-schema-001",
|
||
"timestamp": "2026-06-15T16:52:13.002942+00:00",
|
||
"input_query": "帮我优化Schema",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "schema_advisor",
|
||
"expected_execution_mode": "custom",
|
||
"expected_complexity": "medium",
|
||
"actual_skill": "schema_advisor",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.13258098624646664,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-monitor-001",
|
||
"test_name": "layer1_route-kw-monitor-001",
|
||
"timestamp": "2026-06-15T16:52:13.004811+00:00",
|
||
"input_query": "监测品牌引用变化",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "monitor",
|
||
"expected_execution_mode": "custom",
|
||
"expected_complexity": "medium",
|
||
"actual_skill": "monitor",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.16177998622879386,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-goal-001",
|
||
"test_name": "layer1_route-kw-goal-001",
|
||
"timestamp": "2026-06-15T16:52:23.690257+00:00",
|
||
"input_query": "分析竞品SEO策略并生成优化方案",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "goal_driven_agent",
|
||
"expected_execution_mode": "tool_call",
|
||
"expected_complexity": "medium",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 10683.721612032969,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-direct-001",
|
||
"test_name": "semantic_semantic-direct-001",
|
||
"timestamp": "2026-06-15T16:52:23.694450+00:00",
|
||
"input_query": "简单生成任务,无需工具调用",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "description_match",
|
||
"response_time_ms": 2.3536229855380952,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-react-001",
|
||
"test_name": "semantic_semantic-react-001",
|
||
"timestamp": "2026-06-15T16:52:28.222957+00:00",
|
||
"input_query": "需要动态适应、逐步推理和工具调用",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "react_agent",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "description_match",
|
||
"response_time_ms": 4526.445869996678,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-rewoo-001",
|
||
"test_name": "semantic_semantic-rewoo-001",
|
||
"timestamp": "2026-06-15T16:52:28.227266+00:00",
|
||
"input_query": "多源数据并行采集、无依赖工具调用批量执行",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "rewoo_agent",
|
||
"expected_execution_mode": "rewoo",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "description_match",
|
||
"response_time_ms": 2.2753190132789314,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-reflex-001",
|
||
"test_name": "semantic_semantic-reflex-001",
|
||
"timestamp": "2026-06-15T16:52:30.828067+00:00",
|
||
"input_query": "需要高精度和自我验证的任务",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "reflexion_agent",
|
||
"expected_execution_mode": "reflexion",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "reflexion_agent",
|
||
"actual_execution_mode": "reflexion",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "description_match",
|
||
"response_time_ms": 2598.3480180148035,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-planexec-001",
|
||
"test_name": "semantic_semantic-planexec-001",
|
||
"timestamp": "2026-06-15T16:52:30.830628+00:00",
|
||
"input_query": "结构化多步骤任务,需要可审查的规划和执行",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "plan_exec_agent",
|
||
"expected_execution_mode": "plan_exec",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "description_match",
|
||
"response_time_ms": 0.47854799777269363,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-geo-001",
|
||
"test_name": "semantic_semantic-geo-001",
|
||
"timestamp": "2026-06-15T16:52:30.832637+00:00",
|
||
"input_query": "对文章进行GEO/SEO优化,提升在AI搜索引擎中的可见性",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "geo_optimizer",
|
||
"expected_execution_mode": "llm_generate",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "geo_optimizer",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "description_match",
|
||
"response_time_ms": 0.18697103951126337,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-citation-001",
|
||
"test_name": "semantic_semantic-citation-001",
|
||
"timestamp": "2026-06-15T16:52:30.836459+00:00",
|
||
"input_query": "检测品牌在各AI平台回答中的引用情况",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "citation_detector",
|
||
"expected_execution_mode": "custom",
|
||
"expected_complexity": "medium",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "description_match",
|
||
"response_time_ms": 2.184002019930631,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-competitor-001",
|
||
"test_name": "semantic_semantic-competitor-001",
|
||
"timestamp": "2026-06-15T16:52:30.838268+00:00",
|
||
"input_query": "分析竞品策略、对比品牌差距或发现竞争机会",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "competitor_analyzer",
|
||
"expected_execution_mode": "tool_call",
|
||
"expected_complexity": "medium",
|
||
"actual_skill": "competitor_analyzer",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "description_match",
|
||
"response_time_ms": 0.15963200712576509,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-colloquial-review-001",
|
||
"test_name": "semantic_semantic-colloquial-review-001",
|
||
"timestamp": "2026-06-15T16:52:42.892865+00:00",
|
||
"input_query": "帮我看看代码有没有问题",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "code_reviewer",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "medium",
|
||
"actual_skill": "code_reviewer",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "colloquial_match",
|
||
"response_time_ms": 12052.96553100925,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-colloquial-trend-001",
|
||
"test_name": "semantic_semantic-colloquial-trend-001",
|
||
"timestamp": "2026-06-15T16:52:42.895968+00:00",
|
||
"input_query": "最近市场行情怎么样",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "trend_agent",
|
||
"expected_execution_mode": "tool_call",
|
||
"expected_complexity": "medium",
|
||
"actual_skill": "trend_agent",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "colloquial_match",
|
||
"response_time_ms": 0.17141696298494935,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-colloquial-content-001",
|
||
"test_name": "semantic_semantic-colloquial-content-001",
|
||
"timestamp": "2026-06-15T16:52:42.899454+00:00",
|
||
"input_query": "帮我写点东西",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "content_generator",
|
||
"expected_execution_mode": "llm_generate",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "content_generator",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "colloquial_match",
|
||
"response_time_ms": 0.17780199414119124,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-colloquial-citation-001",
|
||
"test_name": "semantic_semantic-colloquial-citation-001",
|
||
"timestamp": "2026-06-15T16:52:42.901686+00:00",
|
||
"input_query": "这个引用对不对",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "citation_detector",
|
||
"expected_execution_mode": "custom",
|
||
"expected_complexity": "medium",
|
||
"actual_skill": "citation_detector",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "colloquial_match",
|
||
"response_time_ms": 0.13318302808329463,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-colloquial-competitor-001",
|
||
"test_name": "semantic_semantic-colloquial-competitor-001",
|
||
"timestamp": "2026-06-15T16:52:42.903890+00:00",
|
||
"input_query": "对手怎么样",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "competitor_analyzer",
|
||
"expected_execution_mode": "tool_call",
|
||
"expected_complexity": "medium",
|
||
"actual_skill": "competitor_analyzer",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "colloquial_match",
|
||
"response_time_ms": 0.12990902177989483,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-mixed-review-001",
|
||
"test_name": "semantic_semantic-mixed-review-001",
|
||
"timestamp": "2026-06-15T16:52:42.908177+00:00",
|
||
"input_query": "review一下这段代码",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "code_reviewer",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "medium",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "mixed_lang_match",
|
||
"response_time_ms": 2.3870580480434,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-mixed-geo-001",
|
||
"test_name": "semantic_semantic-mixed-geo-001",
|
||
"timestamp": "2026-06-15T16:52:42.910255+00:00",
|
||
"input_query": "做个SEO优化",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "geo_optimizer",
|
||
"expected_execution_mode": "llm_generate",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "geo_optimizer",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "mixed_lang_match",
|
||
"response_time_ms": 0.1305780024267733,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-mixed-monitor-001",
|
||
"test_name": "semantic_semantic-mixed-monitor-001",
|
||
"timestamp": "2026-06-15T16:52:42.912450+00:00",
|
||
"input_query": "monitor一下系统状态",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "monitor",
|
||
"expected_execution_mode": "tool_call",
|
||
"expected_complexity": "medium",
|
||
"actual_skill": "monitor",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "semantic_router",
|
||
"subcategory": "mixed_lang_match",
|
||
"response_time_ms": 0.15147699741646647,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-001",
|
||
"test_name": "para_orig_route-kw-direct-001",
|
||
"timestamp": "2026-06-15T16:52:42.914808+00:00",
|
||
"input_query": "翻译这段话",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "direct_agent",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.1444679801352322,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-001",
|
||
"test_name": "para_route-kw-direct-001_0",
|
||
"timestamp": "2026-06-15T16:52:42.915469+00:00",
|
||
"input_query": "翻译这段话",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "direct_agent",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.12570497347041965,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-001",
|
||
"test_name": "para_route-kw-direct-001_1",
|
||
"timestamp": "2026-06-15T16:52:42.915955+00:00",
|
||
"input_query": "翻译这段话",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "direct_agent",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.11087598977610469,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-001",
|
||
"test_name": "para_route-kw-direct-001_2",
|
||
"timestamp": "2026-06-15T16:52:42.916417+00:00",
|
||
"input_query": "翻译这段话",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "direct_agent",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.1154160127043724,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-002",
|
||
"test_name": "para_orig_route-kw-direct-002",
|
||
"timestamp": "2026-06-15T16:52:48.823381+00:00",
|
||
"input_query": "帮我总结一下",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "direct_agent",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 5905.160357011482,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-002",
|
||
"test_name": "para_route-kw-direct-002_0",
|
||
"timestamp": "2026-06-15T16:52:48.826343+00:00",
|
||
"input_query": "帮我总结一下",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.4711660225875676,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-002",
|
||
"test_name": "para_route-kw-direct-002_1",
|
||
"timestamp": "2026-06-15T16:52:48.826917+00:00",
|
||
"input_query": "帮我总结一下",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "direct_agent",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.1197229721583426,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-002",
|
||
"test_name": "para_route-kw-direct-002_2",
|
||
"timestamp": "2026-06-15T16:52:48.827366+00:00",
|
||
"input_query": "帮我总结一下",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "direct_agent",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.10035402374342084,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-003",
|
||
"test_name": "para_orig_route-kw-direct-003",
|
||
"timestamp": "2026-06-15T16:52:48.830277+00:00",
|
||
"input_query": "什么是RAG?",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "direct_agent",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.2366119879297912,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-003",
|
||
"test_name": "para_route-kw-direct-003_0",
|
||
"timestamp": "2026-06-15T16:52:59.573007+00:00",
|
||
"input_query": "什么是RAG?",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "direct_agent",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 10742.036784009542,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-003",
|
||
"test_name": "para_route-kw-direct-003_1",
|
||
"timestamp": "2026-06-15T16:52:59.575759+00:00",
|
||
"input_query": "什么是RAG?",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.285740978550166,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-003",
|
||
"test_name": "para_route-kw-direct-003_2",
|
||
"timestamp": "2026-06-15T16:53:03.327613+00:00",
|
||
"input_query": "什么是RAG?",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "direct_agent",
|
||
"expected_execution_mode": "direct",
|
||
"expected_complexity": "low",
|
||
"actual_skill": "direct_agent",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": true,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 3751.3481359928846,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-001",
|
||
"test_name": "para_orig_route-kw-react-001",
|
||
"timestamp": "2026-06-15T16:53:03.332556+00:00",
|
||
"input_query": "搜索一下AI Agent市场数据",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.620374958496541,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-001",
|
||
"test_name": "para_route-kw-react-001_0",
|
||
"timestamp": "2026-06-15T16:53:16.156076+00:00",
|
||
"input_query": "搜索一下AI Agent市场数据",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 12823.078655987047,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-001",
|
||
"test_name": "para_route-kw-react-001_1",
|
||
"timestamp": "2026-06-15T16:53:16.158891+00:00",
|
||
"input_query": "搜索一下AI Agent市场数据",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.353978983592242,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-001",
|
||
"test_name": "para_route-kw-react-001_2",
|
||
"timestamp": "2026-06-15T16:53:33.222795+00:00",
|
||
"input_query": "搜索一下AI Agent市场数据",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "trend_agent",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": false,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 17063.475835020654,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-002",
|
||
"test_name": "para_orig_route-kw-react-002",
|
||
"timestamp": "2026-06-15T16:53:33.226654+00:00",
|
||
"input_query": "帮我分析这个数据",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.1171270054765046,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-002",
|
||
"test_name": "para_route-kw-react-002_0",
|
||
"timestamp": "2026-06-15T16:53:45.186234+00:00",
|
||
"input_query": "帮我分析这个数据",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 11959.168867964763,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-002",
|
||
"test_name": "para_route-kw-react-002_1",
|
||
"timestamp": "2026-06-15T16:53:45.188794+00:00",
|
||
"input_query": "帮我分析这个数据",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.155377995222807,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-002",
|
||
"test_name": "para_route-kw-react-002_2",
|
||
"timestamp": "2026-06-15T16:54:10.649803+00:00",
|
||
"input_query": "帮我分析这个数据",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "react_agent",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 25460.632925038226,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-003",
|
||
"test_name": "para_orig_route-kw-react-003",
|
||
"timestamp": "2026-06-15T16:54:10.683119+00:00",
|
||
"input_query": "实时监控竞品动态",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.5149360299110413,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-003",
|
||
"test_name": "para_route-kw-react-003_0",
|
||
"timestamp": "2026-06-15T16:54:26.573171+00:00",
|
||
"input_query": "实时监控竞品动态",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "competitor_analyzer",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": false,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 15889.646161987912,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-003",
|
||
"test_name": "para_route-kw-react-003_1",
|
||
"timestamp": "2026-06-15T16:54:26.573841+00:00",
|
||
"input_query": "实时监控竞品动态",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "competitor_analyzer",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": false,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.1847759704105556,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-003",
|
||
"test_name": "para_route-kw-react-003_2",
|
||
"timestamp": "2026-06-15T16:54:26.576540+00:00",
|
||
"input_query": "实时监控竞品动态",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "react_agent",
|
||
"expected_execution_mode": "react",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.298591018188745,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-001",
|
||
"test_name": "para_orig_route-kw-rewoo-001",
|
||
"timestamp": "2026-06-15T16:54:26.578588+00:00",
|
||
"input_query": "采集A、B、C三个竞品的功能数据",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "rewoo_agent",
|
||
"expected_execution_mode": "rewoo",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "competitor_analyzer",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": false,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.1343649928458035,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-001",
|
||
"test_name": "para_route-kw-rewoo-001_0",
|
||
"timestamp": "2026-06-15T16:54:26.579414+00:00",
|
||
"input_query": "采集A、B、C三个竞品的功能数据",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "rewoo_agent",
|
||
"expected_execution_mode": "rewoo",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "rewoo_agent",
|
||
"actual_execution_mode": "rewoo",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": true,
|
||
"execution_mode_correct": true,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.18782296683639288,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-001",
|
||
"test_name": "para_route-kw-rewoo-001_1",
|
||
"timestamp": "2026-06-15T16:54:26.579942+00:00",
|
||
"input_query": "采集A、B、C三个竞品的功能数据",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "rewoo_agent",
|
||
"expected_execution_mode": "rewoo",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "competitor_analyzer",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": false,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.11551898205652833,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-001",
|
||
"test_name": "para_route-kw-rewoo-001_2",
|
||
"timestamp": "2026-06-15T16:54:26.580533+00:00",
|
||
"input_query": "采集A、B、C三个竞品的功能数据",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "rewoo_agent",
|
||
"expected_execution_mode": "rewoo",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "competitor_analyzer",
|
||
"actual_execution_mode": "skill_react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": false,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 0.12395897647365928,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-002",
|
||
"test_name": "para_orig_route-kw-rewoo-002",
|
||
"timestamp": "2026-06-15T16:54:38.782975+00:00",
|
||
"input_query": "并行搜索多个关键词",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "rewoo_agent",
|
||
"expected_execution_mode": "rewoo",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 12200.403939001262,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-002",
|
||
"test_name": "para_route-kw-rewoo-002_0",
|
||
"timestamp": "2026-06-15T16:54:38.785626+00:00",
|
||
"input_query": "并行搜索多个关键词",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "rewoo_agent",
|
||
"expected_execution_mode": "rewoo",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.231539983768016,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-002",
|
||
"test_name": "para_route-kw-rewoo-002_1",
|
||
"timestamp": "2026-06-15T16:54:51.536812+00:00",
|
||
"input_query": "并行搜索多个关键词",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "rewoo_agent",
|
||
"expected_execution_mode": "rewoo",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 12750.671702960972,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-002",
|
||
"test_name": "para_route-kw-rewoo-002_2",
|
||
"timestamp": "2026-06-15T16:54:51.539379+00:00",
|
||
"input_query": "并行搜索多个关键词",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "rewoo_agent",
|
||
"expected_execution_mode": "rewoo",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.15318298432976,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-reflex-001",
|
||
"test_name": "para_orig_route-kw-reflex-001",
|
||
"timestamp": "2026-06-15T16:55:04.031642+00:00",
|
||
"input_query": "审查这段代码的合规性",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "reflexion_agent",
|
||
"expected_execution_mode": "reflexion",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "code_reviewer",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": false,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 12490.38528103847,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-reflex-001",
|
||
"test_name": "para_route-kw-reflex-001_0",
|
||
"timestamp": "2026-06-15T16:55:04.034439+00:00",
|
||
"input_query": "审查这段代码的合规性",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "reflexion_agent",
|
||
"expected_execution_mode": "reflexion",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.3601570283062756,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-reflex-001",
|
||
"test_name": "para_route-kw-reflex-001_1",
|
||
"timestamp": "2026-06-15T16:55:16.172980+00:00",
|
||
"input_query": "审查这段代码的合规性",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "reflexion_agent",
|
||
"expected_execution_mode": "reflexion",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "code_reviewer",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": false,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 12138.08024401078,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-reflex-001",
|
||
"test_name": "para_route-kw-reflex-001_2",
|
||
"timestamp": "2026-06-15T16:55:16.175857+00:00",
|
||
"input_query": "审查这段代码的合规性",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "reflexion_agent",
|
||
"expected_execution_mode": "reflexion",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.3903230321593583,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-reflex-002",
|
||
"test_name": "para_orig_route-kw-reflex-002",
|
||
"timestamp": "2026-06-15T16:55:27.673019+00:00",
|
||
"input_query": "生成一个高精度的数据分析脚本",
|
||
"is_paraphrase": false,
|
||
"expected_skill": "reflexion_agent",
|
||
"expected_execution_mode": "reflexion",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 11494.954236026388,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-reflex-002",
|
||
"test_name": "para_route-kw-reflex-002_0",
|
||
"timestamp": "2026-06-15T16:55:27.675594+00:00",
|
||
"input_query": "生成一个高精度的数据分析脚本",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "reflexion_agent",
|
||
"expected_execution_mode": "reflexion",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.139441028703004,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-reflex-002",
|
||
"test_name": "para_route-kw-reflex-002_1",
|
||
"timestamp": "2026-06-15T16:55:44.080887+00:00",
|
||
"input_query": "生成一个高精度的数据分析脚本",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "reflexion_agent",
|
||
"expected_execution_mode": "reflexion",
|
||
"expected_complexity": "high",
|
||
"actual_skill": "direct_agent",
|
||
"actual_execution_mode": "direct_chat",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": false,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 16404.873749008402,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-reflex-002",
|
||
"test_name": "para_route-kw-reflex-002_2",
|
||
"timestamp": "2026-06-15T16:55:44.083850+00:00",
|
||
"input_query": "生成一个高精度的数据分析脚本",
|
||
"is_paraphrase": true,
|
||
"expected_skill": "reflexion_agent",
|
||
"expected_execution_mode": "reflexion",
|
||
"expected_complexity": "high",
|
||
"actual_skill": null,
|
||
"actual_execution_mode": "react",
|
||
"actual_status_code": 200,
|
||
"actual_response_keys": [],
|
||
"actual_complexity_score": null,
|
||
"actual_match_method": null,
|
||
"actual_match_confidence": null,
|
||
"skill_correct": null,
|
||
"execution_mode_correct": false,
|
||
"complexity_correct": false,
|
||
"task_succeeded": true,
|
||
"category": "routing",
|
||
"subcategory": "keyword_match",
|
||
"response_time_ms": 2.364657004363835,
|
||
"error_message": null,
|
||
"alignment_violations": 0,
|
||
"cascade_alert": false,
|
||
"output_quality_score": null,
|
||
"output_quality_reasoning": null
|
||
}
|
||
],
|
||
"output_quality_evaluations": [
|
||
{
|
||
"benchmark_id": "route-edge-explicit-001",
|
||
"input_query": "@skill:react_agent 搜索最新的AI新闻",
|
||
"expected_skill": "react_agent",
|
||
"actual_skill": "react_agent",
|
||
"quality_score": 5.0,
|
||
"reasoning": "路由精准匹配用户指定的技能与意图,执行模式完全正确。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-001",
|
||
"input_query": "翻译这段话",
|
||
"expected_skill": "direct_agent",
|
||
"actual_skill": "direct_agent",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-002",
|
||
"input_query": "帮我总结一下",
|
||
"expected_skill": "direct_agent",
|
||
"actual_skill": "direct_agent",
|
||
"quality_score": 4.0,
|
||
"reasoning": "路由与期望完全一致,direct_chat模式适合处理此类缺乏具体上下文的模糊指令,以便进行澄清或基于历史对话进行总结。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-003",
|
||
"input_query": "什么是RAG?",
|
||
"expected_skill": "direct_agent",
|
||
"actual_skill": "direct_agent",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-001",
|
||
"input_query": "采集A、B、C三个竞品的功能数据",
|
||
"expected_skill": "rewoo_agent",
|
||
"actual_skill": "competitor_analyzer",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Invalid \\escape: line 1 column 35 (char 34)",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-planexec-002",
|
||
"input_query": "规划产品优化方案",
|
||
"expected_skill": "plan_exec_agent",
|
||
"actual_skill": "plan_exec_agent",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-geo-001",
|
||
"input_query": "帮我优化这篇文章的SEO",
|
||
"expected_skill": "geo_optimizer",
|
||
"actual_skill": "geo_optimizer",
|
||
"quality_score": 5.0,
|
||
"reasoning": "路由精准匹配期望技能,且技能名称完全契合用户优化SEO的意图。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-deai-001",
|
||
"input_query": "帮我把这篇文章去AI化",
|
||
"expected_skill": "deai_agent",
|
||
"actual_skill": "deai_agent",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-content-001",
|
||
"input_query": "帮我写一篇关于AI的文章",
|
||
"expected_skill": "content_generator",
|
||
"actual_skill": "content_generator",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-schema-001",
|
||
"input_query": "帮我优化Schema",
|
||
"expected_skill": "schema_advisor",
|
||
"actual_skill": "schema_advisor",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-monitor-001",
|
||
"input_query": "监测品牌引用变化",
|
||
"expected_skill": "monitor",
|
||
"actual_skill": "monitor",
|
||
"quality_score": 5.0,
|
||
"reasoning": "实际路由技能与期望技能完全一致,精准匹配用户监测品牌引用变化的意图。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-react-001",
|
||
"input_query": "需要动态适应、逐步推理和工具调用",
|
||
"expected_skill": "react_agent",
|
||
"actual_skill": "react_agent",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-reflex-001",
|
||
"input_query": "需要高精度和自我验证的任务",
|
||
"expected_skill": "reflexion_agent",
|
||
"actual_skill": "reflexion_agent",
|
||
"quality_score": 5.0,
|
||
"reasoning": "实际路由技能与期望技能完全一致,且反思(reflexion)执行模式完美契合高精度与自我验证的任务需求。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-geo-001",
|
||
"input_query": "对文章进行GEO/SEO优化,提升在AI搜索引擎中的可见性",
|
||
"expected_skill": "geo_optimizer",
|
||
"actual_skill": "geo_optimizer",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-competitor-001",
|
||
"input_query": "分析竞品策略、对比品牌差距或发现竞争机会",
|
||
"expected_skill": "competitor_analyzer",
|
||
"actual_skill": "competitor_analyzer",
|
||
"quality_score": 5.0,
|
||
"reasoning": "实际路由技能与期望技能完全一致,精准匹配用户分析竞品和发现竞争机会的意图。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-colloquial-review-001",
|
||
"input_query": "帮我看看代码有没有问题",
|
||
"expected_skill": "code_reviewer",
|
||
"actual_skill": "code_reviewer",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-colloquial-trend-001",
|
||
"input_query": "最近市场行情怎么样",
|
||
"expected_skill": "trend_agent",
|
||
"actual_skill": "trend_agent",
|
||
"quality_score": 5.0,
|
||
"reasoning": "实际路由技能与期望技能完全一致,精准匹配了用户查询市场行情的意图。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-colloquial-content-001",
|
||
"input_query": "帮我写点东西",
|
||
"expected_skill": "content_generator",
|
||
"actual_skill": "content_generator",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-colloquial-citation-001",
|
||
"input_query": "这个引用对不对",
|
||
"expected_skill": "citation_detector",
|
||
"actual_skill": "citation_detector",
|
||
"quality_score": 5.0,
|
||
"reasoning": "路由精准匹配用户意图与期望技能,完全符合检测引用正确性的需求。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-colloquial-competitor-001",
|
||
"input_query": "对手怎么样",
|
||
"expected_skill": "competitor_analyzer",
|
||
"actual_skill": "competitor_analyzer",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-mixed-geo-001",
|
||
"input_query": "做个SEO优化",
|
||
"expected_skill": "geo_optimizer",
|
||
"actual_skill": "geo_optimizer",
|
||
"quality_score": 5.0,
|
||
"reasoning": "实际路由技能与期望技能完全一致,精准匹配用户的SEO优化意图。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "semantic-mixed-monitor-001",
|
||
"input_query": "monitor一下系统状态",
|
||
"expected_skill": "monitor",
|
||
"actual_skill": "monitor",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-001",
|
||
"input_query": "翻译这段话",
|
||
"expected_skill": "direct_agent",
|
||
"actual_skill": "direct_agent",
|
||
"quality_score": 5.0,
|
||
"reasoning": "实际路由与期望技能完全一致,且direct_chat模式能够精准且高质量地处理“翻译”这一直接文本指令。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-001",
|
||
"input_query": "翻译这段话",
|
||
"expected_skill": "direct_agent",
|
||
"actual_skill": "direct_agent",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-001",
|
||
"input_query": "翻译这段话",
|
||
"expected_skill": "direct_agent",
|
||
"actual_skill": "direct_agent",
|
||
"quality_score": 5.0,
|
||
"reasoning": "实际路由与期望技能完全一致,direct_agent能够精准处理翻译这一直接指令,执行模式完全匹配用户意图。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-001",
|
||
"input_query": "翻译这段话",
|
||
"expected_skill": "direct_agent",
|
||
"actual_skill": "direct_agent",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-002",
|
||
"input_query": "帮我总结一下",
|
||
"expected_skill": "direct_agent",
|
||
"actual_skill": "direct_agent",
|
||
"quality_score": 5.0,
|
||
"reasoning": "路由精准匹配期望技能,direct_agent能够妥善处理此类缺乏具体上下文的模糊总结指令(如引导追问或基于历史对话进行总结)。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-002",
|
||
"input_query": "帮我总结一下",
|
||
"expected_skill": "direct_agent",
|
||
"actual_skill": "direct_agent",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-002",
|
||
"input_query": "帮我总结一下",
|
||
"expected_skill": "direct_agent",
|
||
"actual_skill": "direct_agent",
|
||
"quality_score": 5.0,
|
||
"reasoning": "用户输入缺乏具体上下文,路由至direct_agent进行直接对话以澄清意图或引导补充信息是完全正确且最优的处理方式。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-003",
|
||
"input_query": "什么是RAG?",
|
||
"expected_skill": "direct_agent",
|
||
"actual_skill": "direct_agent",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-003",
|
||
"input_query": "什么是RAG?",
|
||
"expected_skill": "direct_agent",
|
||
"actual_skill": "direct_agent",
|
||
"quality_score": 5.0,
|
||
"reasoning": "路由精准匹配期望技能,直接回答RAG概念完全符合用户意图且质量优秀。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-direct-003",
|
||
"input_query": "什么是RAG?",
|
||
"expected_skill": "direct_agent",
|
||
"actual_skill": "direct_agent",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-001",
|
||
"input_query": "搜索一下AI Agent市场数据",
|
||
"expected_skill": "react_agent",
|
||
"actual_skill": "trend_agent",
|
||
"quality_score": 4.0,
|
||
"reasoning": "实际路由的trend_agent在处理“市场数据”时比通用的react_agent更具针对性,能精准匹配用户获取市场趋势数据的意图。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-002",
|
||
"input_query": "帮我分析这个数据",
|
||
"expected_skill": "react_agent",
|
||
"actual_skill": "react_agent",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-003",
|
||
"input_query": "实时监控竞品动态",
|
||
"expected_skill": "react_agent",
|
||
"actual_skill": "competitor_analyzer",
|
||
"quality_score": 5.0,
|
||
"reasoning": "实际路由的competitor_analyzer比期望的通用react_agent更精准地垂直匹配了“竞品动态”这一具体意图,路由精准且执行模式合理。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-react-003",
|
||
"input_query": "实时监控竞品动态",
|
||
"expected_skill": "react_agent",
|
||
"actual_skill": "competitor_analyzer",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-001",
|
||
"input_query": "采集A、B、C三个竞品的功能数据",
|
||
"expected_skill": "rewoo_agent",
|
||
"actual_skill": "competitor_analyzer",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-001",
|
||
"input_query": "采集A、B、C三个竞品的功能数据",
|
||
"expected_skill": "rewoo_agent",
|
||
"actual_skill": "rewoo_agent",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-001",
|
||
"input_query": "采集A、B、C三个竞品的功能数据",
|
||
"expected_skill": "rewoo_agent",
|
||
"actual_skill": "competitor_analyzer",
|
||
"quality_score": 3.0,
|
||
"reasoning": "路由到了竞品相关技能,领域高度匹配,但“分析”与“采集”侧重点略有偏差,且处理多竞品采集任务时可能不如通用规划Agent完整灵活。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-rewoo-001",
|
||
"input_query": "采集A、B、C三个竞品的功能数据",
|
||
"expected_skill": "rewoo_agent",
|
||
"actual_skill": "competitor_analyzer",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-reflex-001",
|
||
"input_query": "审查这段代码的合规性",
|
||
"expected_skill": "reflexion_agent",
|
||
"actual_skill": "code_reviewer",
|
||
"quality_score": 4.0,
|
||
"reasoning": "实际路由code_reviewer高度契合审查代码的意图,虽与期望的reflexion_agent不同,但路由准确且直接有效。",
|
||
"evaluated": true
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-reflex-001",
|
||
"input_query": "审查这段代码的合规性",
|
||
"expected_skill": "reflexion_agent",
|
||
"actual_skill": "code_reviewer",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Event loop is closed",
|
||
"evaluated": false
|
||
},
|
||
{
|
||
"benchmark_id": "route-kw-reflex-002",
|
||
"input_query": "生成一个高精度的数据分析脚本",
|
||
"expected_skill": "reflexion_agent",
|
||
"actual_skill": "direct_agent",
|
||
"quality_score": 0.0,
|
||
"reasoning": "Evaluation error: Invalid \\escape: line 1 column 82 (char 81)",
|
||
"evaluated": false
|
||
}
|
||
]
|
||
} |