{ "generated_at": "2026-06-15T16:59:06.575194+00:00", "total_observations": 82, "overall_skill_recall": 0.9, "overall_skill_precision": 0.9, "overall_skill_f1": 0.9, "overall_execution_mode_accuracy": 0.4038, "overall_task_success_rate": 1.0, "category_metrics": [ { "category": "routing", "subcategory": "explicit_prefix", "total": 1, "skill_correct": 1, "skill_recall": 1.0, "skill_precision": 1.0, "skill_f1": 1.0, "execution_mode_correct": 1, "execution_mode_accuracy": 1.0, "complexity_correct": 0, "complexity_accuracy": 0.0, "task_success_rate": 1.0, "avg_response_time_ms": 0.05 }, { "category": "routing", "subcategory": "greeting", "total": 2, "skill_correct": 2, "skill_recall": 1.0, "skill_precision": 1.0, "skill_f1": 1.0, "execution_mode_correct": 2, "execution_mode_accuracy": 1.0, "complexity_correct": 2, "complexity_accuracy": 1.0, "task_success_rate": 1.0, "avg_response_time_ms": 0.03 }, { "category": "routing", "subcategory": "identity", "total": 1, "skill_correct": 1, "skill_recall": 1.0, "skill_precision": 1.0, "skill_f1": 1.0, "execution_mode_correct": 1, "execution_mode_accuracy": 1.0, "complexity_correct": 1, "complexity_accuracy": 1.0, "task_success_rate": 1.0, "avg_response_time_ms": 0.02 }, { "category": "routing", "subcategory": "keyword_match", "total": 62, "skill_correct": 21, "skill_recall": 0.6774, "skill_precision": 0.6774, "skill_f1": 0.6774, "execution_mode_correct": 32, "execution_mode_accuracy": 0.5161, "complexity_correct": 22, "complexity_accuracy": 0.3548, "task_success_rate": 1.0, "avg_response_time_ms": 4449.27 }, { "category": "semantic_router", "subcategory": "colloquial_match", "total": 5, "skill_correct": 5, "skill_recall": 1.0, "skill_precision": 1.0, "skill_f1": 1.0, "execution_mode_correct": 0, "execution_mode_accuracy": 0.0, "complexity_correct": 1, "complexity_accuracy": 0.2, "task_success_rate": 1.0, "avg_response_time_ms": 2410.72 }, { "category": "semantic_router", "subcategory": "description_match", "total": 8, "skill_correct": 4, "skill_recall": 1.0, "skill_precision": 1.0, "skill_f1": 1.0, "execution_mode_correct": 3, "execution_mode_accuracy": 0.375, "complexity_correct": 2, "complexity_accuracy": 0.25, "task_success_rate": 1.0, "avg_response_time_ms": 891.55 }, { "category": "semantic_router", "subcategory": "mixed_lang_match", "total": 3, "skill_correct": 2, "skill_recall": 1.0, "skill_precision": 1.0, "skill_f1": 1.0, "execution_mode_correct": 1, "execution_mode_accuracy": 0.3333, "complexity_correct": 2, "complexity_accuracy": 0.6667, "task_success_rate": 1.0, "avg_response_time_ms": 0.89 } ], "overfitting_results": [ { "benchmark_id": "route-kw-direct-001", "original_correct": true, "paraphrase_results": [ true, true, true ], "consistency_rate": 1.0, "is_overfitted": false }, { "benchmark_id": "route-kw-direct-002", "original_correct": true, "paraphrase_results": [ false, true, true ], "consistency_rate": 0.6667, "is_overfitted": false }, { "benchmark_id": "route-kw-direct-003", "original_correct": true, "paraphrase_results": [ true, false, true ], "consistency_rate": 0.6667, "is_overfitted": false }, { "benchmark_id": "route-kw-react-001", "original_correct": false, "paraphrase_results": [ false, false, false ], "consistency_rate": 1.0, "is_overfitted": false }, { "benchmark_id": "route-kw-react-002", "original_correct": false, "paraphrase_results": [ false, false, true ], "consistency_rate": 0.6667, "is_overfitted": false }, { "benchmark_id": "route-kw-react-003", "original_correct": false, "paraphrase_results": [ false, false, false ], "consistency_rate": 1.0, "is_overfitted": false }, { "benchmark_id": "route-kw-rewoo-001", "original_correct": false, "paraphrase_results": [ true, false, false ], "consistency_rate": 0.6667, "is_overfitted": false }, { "benchmark_id": "route-kw-rewoo-002", "original_correct": false, "paraphrase_results": [ false, false, false ], "consistency_rate": 1.0, "is_overfitted": false }, { "benchmark_id": "route-kw-reflex-001", "original_correct": false, "paraphrase_results": [ false, false, false ], "consistency_rate": 1.0, "is_overfitted": false }, { "benchmark_id": "route-kw-reflex-002", "original_correct": false, "paraphrase_results": [ false, false, false ], "consistency_rate": 1.0, "is_overfitted": false } ], "overfitting_score": 0.2222, "weaknesses": [ { "dimension": "semantic_router", "subcategory": "colloquial_match", "severity": "high", "description": "执行模式准确率过低 (0.00%),子类别: colloquial_match", "evidence": "正确数=0/5", "suggestion": "检查复杂度估算和模式选择逻辑" }, { "dimension": "semantic_router", "subcategory": "description_match", "severity": "high", "description": "执行模式准确率过低 (37.50%),子类别: description_match", "evidence": "正确数=3/8", "suggestion": "检查复杂度估算和模式选择逻辑" }, { "dimension": "semantic_router", "subcategory": "mixed_lang_match", "severity": "high", "description": "执行模式准确率过低 (33.33%),子类别: mixed_lang_match", "evidence": "正确数=1/3", "suggestion": "检查复杂度估算和模式选择逻辑" }, { "dimension": "routing", "subcategory": "keyword_match", "severity": "medium", "description": "技能路由F1偏低 (0.68),子类别: keyword_match", "evidence": "召回率=67.74%, 精确率=67.74%, 样本数=62", "suggestion": "微调路由阈值或增加更多意图示例" }, { "dimension": "routing", "subcategory": "keyword_match", "severity": "medium", "description": "执行模式准确率过低 (51.61%),子类别: keyword_match", "evidence": "正确数=32/62", "suggestion": "检查复杂度估算和模式选择逻辑" } ], "root_causes": [ { "cause_type": "complexity_misjudge", "cause_description": "复杂度估算偏差:倾向高估复杂度(将简单任务误判为需要多步推理)", "confidence": 0.75, "affected_cases": [ "route-kw-rewoo-001", "route-kw-rewoo-002", "route-kw-reflex-001", "route-kw-reflex-002", "route-kw-planexec-001", "route-kw-coderev-001", "route-kw-geo-001", "route-kw-deai-001", "route-kw-content-001", "route-kw-citation-001" ], "detail": "共 31 个执行模式判断错误。低估复杂度 0 次,高估复杂度 1 次。受影响子类别: description_match, mixed_lang_match, colloquial_match, keyword_match" }, { "cause_type": "intent_ambiguous", "cause_description": "意图歧义:不同技能的关键词/意图描述重叠,导致路由混淆", "confidence": 0.7, "affected_cases": [ "route-kw-rewoo-001", "route-kw-rewoo-001", "route-kw-reflex-001" ], "detail": "技能混淆对: rewoo_agent→competitor_analyzer(2次); reflexion_agent→code_reviewer(1次)" }, { "cause_type": "quality_threshold", "cause_description": "质量门控阈值过低:任务虽成功完成但输出了错误结果", "confidence": 0.6, "affected_cases": [ "route-kw-rewoo-001", "route-kw-rewoo-001", "route-kw-reflex-001" ], "detail": "共 3 个任务虽然HTTP成功但路由到了错误技能。质量门控未能拦截这些错误路由的结果。" } ], "improvement_plans": [ { "weakness_description": "意图歧义:不同技能的关键词/意图描述重叠,导致路由混淆", "root_causes": [ { "cause_type": "intent_ambiguous", "cause_description": "意图歧义:不同技能的关键词/意图描述重叠,导致路由混淆", "confidence": 0.7, "affected_cases": [ "route-kw-rewoo-001", "route-kw-rewoo-001", "route-kw-reflex-001" ], "detail": "技能混淆对: rewoo_agent→competitor_analyzer(2次); reflexion_agent→code_reviewer(1次)" } ], "actions": [ { "action_id": "ACT-001", "title": "为易混淆技能添加互斥关键词", "description": "在技能配置中为容易混淆的技能对添加互斥关键词(disambiguation_keywords),当用户输入同时匹配多个技能时,优先选择包含互斥关键词的技能。", "target_module": "configs/skills/*.yaml → intent.disambiguation_keywords", "priority": "P1", "expected_impact": "预计提升精确率 10~25%,减少技能混淆", "effort": "small", "related_causes": [ "intent_ambiguous" ], "verification": "运行歧义消解回测,验证路由精确率提升" }, { "action_id": "ACT-002", "title": "实现LLM二次分类消歧", "description": "当 Layer 0/1 路由到多个候选技能时,调用 LLM quick_classify 进行二次意图判断,选择最匹配的技能。", "target_module": "src/agentkit/chat/skill_routing.py → Layer 1", "priority": "P2", "expected_impact": "预计提升精确率 15~30%,但增加 ~500ms 延迟和 ~100 tokens", "effort": "medium", "related_causes": [ "intent_ambiguous" ], "verification": "运行歧义消解回测,对比延迟和精确率变化" } ], "overall_strategy": "短期:添加互斥关键词消歧;中期:启用LLM二次分类;长期:训练专用意图分类模型替代规则匹配" }, { "weakness_description": "复杂度估算偏差:倾向高估复杂度(将简单任务误判为需要多步推理)", "root_causes": [ { "cause_type": "complexity_misjudge", "cause_description": "复杂度估算偏差:倾向高估复杂度(将简单任务误判为需要多步推理)", "confidence": 0.75, "affected_cases": [ "route-kw-rewoo-001", "route-kw-rewoo-002", "route-kw-reflex-001", "route-kw-reflex-002", "route-kw-planexec-001", "route-kw-coderev-001", "route-kw-geo-001", "route-kw-deai-001", "route-kw-content-001", "route-kw-citation-001" ], "detail": "共 31 个执行模式判断错误。低估复杂度 0 次,高估复杂度 1 次。受影响子类别: description_match, mixed_lang_match, colloquial_match, keyword_match" } ], "actions": [ { "action_id": "ACT-003", "title": "优化复杂度估算启发式规则", "description": "调整 HeuristicClassifier 的复杂度评分权重:增加任务动词(分析/研究/设计)的权重,降低简单问答动词(是什么/多少)的权重。", "target_module": "src/agentkit/chat/skill_routing.py → HeuristicClassifier", "priority": "P1", "expected_impact": "预计提升执行模式准确率 10~20%", "effort": "small", "related_causes": [ "complexity_misjudge" ], "verification": "运行执行模式回测,验证准确率提升" }, { "action_id": "ACT-004", "title": "引入任务复杂度校准数据集", "description": "收集标注了复杂度等级的真实用户查询,构建校准数据集,定期评估和调整复杂度阈值。", "target_module": "tests/e2e/benchmark_dataset.py", "priority": "P2", "expected_impact": "持续提升复杂度判断准确性", "effort": "medium", "related_causes": [ "complexity_misjudge" ], "verification": "每次调整后运行回测,对比前后F1变化" } ], "overall_strategy": "短期:调整启发式规则权重;中期:构建复杂度校准数据集;长期:训练复杂度评估模型替代规则" }, { "weakness_description": "质量门控阈值过低:任务虽成功完成但输出了错误结果", "root_causes": [ { "cause_type": "quality_threshold", "cause_description": "质量门控阈值过低:任务虽成功完成但输出了错误结果", "confidence": 0.6, "affected_cases": [ "route-kw-rewoo-001", "route-kw-rewoo-001", "route-kw-reflex-001" ], "detail": "共 3 个任务虽然HTTP成功但路由到了错误技能。质量门控未能拦截这些错误路由的结果。" } ], "actions": [ { "action_id": "ACT-005", "title": "增强质量门控的技能匹配验证", "description": "在QualityGate中增加技能匹配验证:检查输出是否与路由到的技能的能力范围一致,如果不一致则触发重试或降级。", "target_module": "src/agentkit/quality/gate.py", "priority": "P1", "expected_impact": "减少错误路由导致的低质量输出", "effort": "medium", "related_causes": [ "quality_threshold" ], "verification": "运行质量门控回测,验证错误路由拦截率" } ], "overall_strategy": "短期:增加技能匹配验证;中期:引入输出质量评分模型;长期:实现自动质量回归检测" } ], "raw_observations": [ { "benchmark_id": "route-edge-greet-001", "test_name": "layer0_route-edge-greet-001", "timestamp": "2026-06-15T16:50:48.898798+00:00", "input_query": "你好", "is_paraphrase": false, "expected_skill": null, "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": null, "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "greeting", "response_time_ms": 0.031158037018030882, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-edge-greet-002", "test_name": "layer0_route-edge-greet-002", "timestamp": "2026-06-15T16:50:48.900854+00:00", "input_query": "Good morning!", "is_paraphrase": false, "expected_skill": null, "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": null, "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "greeting", "response_time_ms": 0.02913799835368991, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-edge-identity-001", "test_name": "layer0_route-edge-identity-001", "timestamp": "2026-06-15T16:50:48.902316+00:00", "input_query": "你是谁?", "is_paraphrase": false, "expected_skill": null, "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": null, "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "identity", "response_time_ms": 0.020894978661090136, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-edge-explicit-001", "test_name": "layer0_route-edge-explicit-001", "timestamp": "2026-06-15T16:50:48.903641+00:00", "input_query": "@skill:react_agent 搜索最新的AI新闻", "is_paraphrase": false, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": "react_agent", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "explicit_prefix", "response_time_ms": 0.049009977374225855, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-direct-001", "test_name": "layer1_route-kw-direct-001", "timestamp": "2026-06-15T16:50:48.905245+00:00", "input_query": "翻译这段话", "is_paraphrase": false, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": "direct_agent", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.22018200252205133, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-direct-002", "test_name": "layer1_route-kw-direct-002", "timestamp": "2026-06-15T16:51:05.483207+00:00", "input_query": "帮我总结一下", "is_paraphrase": false, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": "direct_agent", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 16576.47029601503, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-direct-003", "test_name": "layer1_route-kw-direct-003", "timestamp": "2026-06-15T16:51:05.485175+00:00", "input_query": "什么是RAG?", "is_paraphrase": false, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": "direct_agent", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.1331570092588663, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-react-001", "test_name": "layer1_route-kw-react-001", "timestamp": "2026-06-15T16:51:05.489493+00:00", "input_query": "搜索一下AI Agent市场数据", "is_paraphrase": false, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.665565989445895, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-react-002", "test_name": "layer1_route-kw-react-002", "timestamp": "2026-06-15T16:51:17.983942+00:00", "input_query": "帮我分析这个数据", "is_paraphrase": false, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 12492.459080996923, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-react-003", "test_name": "layer1_route-kw-react-003", "timestamp": "2026-06-15T16:51:17.987215+00:00", "input_query": "实时监控竞品动态", "is_paraphrase": false, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 1.6594339977018535, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-rewoo-001", "test_name": "layer1_route-kw-rewoo-001", "timestamp": "2026-06-15T16:51:17.988662+00:00", "input_query": "采集A、B、C三个竞品的功能数据", "is_paraphrase": false, "expected_skill": "rewoo_agent", "expected_execution_mode": "rewoo", "expected_complexity": "high", "actual_skill": "competitor_analyzer", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": false, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.11222198372706771, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-rewoo-002", "test_name": "layer1_route-kw-rewoo-002", "timestamp": "2026-06-15T16:51:30.397745+00:00", "input_query": "并行搜索多个关键词", "is_paraphrase": false, "expected_skill": "rewoo_agent", "expected_execution_mode": "rewoo", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 12407.641994010191, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-reflex-001", "test_name": "layer1_route-kw-reflex-001", "timestamp": "2026-06-15T16:51:30.401715+00:00", "input_query": "审查这段代码的合规性", "is_paraphrase": false, "expected_skill": "reflexion_agent", "expected_execution_mode": "reflexion", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.146629965864122, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-reflex-002", "test_name": "layer1_route-kw-reflex-002", "timestamp": "2026-06-15T16:51:41.776746+00:00", "input_query": "生成一个高精度的数据分析脚本", "is_paraphrase": false, "expected_skill": "reflexion_agent", "expected_execution_mode": "reflexion", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 11373.252779012546, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-planexec-001", "test_name": "layer1_route-kw-planexec-001", "timestamp": "2026-06-15T16:51:41.781653+00:00", "input_query": "生成一份市场分析报告", "is_paraphrase": false, "expected_skill": "plan_exec_agent", "expected_execution_mode": "plan_exec", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.6980440015904605, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-planexec-002", "test_name": "layer1_route-kw-planexec-002", "timestamp": "2026-06-15T16:51:58.323009+00:00", "input_query": "规划产品优化方案", "is_paraphrase": false, "expected_skill": "plan_exec_agent", "expected_execution_mode": "plan_exec", "expected_complexity": "high", "actual_skill": "plan_exec_agent", "actual_execution_mode": "plan_exec", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 16539.517820987385, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-coderev-001", "test_name": "layer1_route-kw-coderev-001", "timestamp": "2026-06-15T16:51:58.326721+00:00", "input_query": "Review this code for quality", "is_paraphrase": false, "expected_skill": "code_reviewer", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.0271629909984767, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-geo-001", "test_name": "layer1_route-kw-geo-001", "timestamp": "2026-06-15T16:51:58.328807+00:00", "input_query": "帮我优化这篇文章的SEO", "is_paraphrase": false, "expected_skill": "geo_optimizer", "expected_execution_mode": "llm_generate", "expected_complexity": "low", "actual_skill": "geo_optimizer", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": false, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.13758597197011113, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-deai-001", "test_name": "layer1_route-kw-deai-001", "timestamp": "2026-06-15T16:51:58.331082+00:00", "input_query": "帮我把这篇文章去AI化", "is_paraphrase": false, "expected_skill": "deai_agent", "expected_execution_mode": "llm_generate", "expected_complexity": "low", "actual_skill": "deai_agent", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": false, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.13313599629327655, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-content-001", "test_name": "layer1_route-kw-content-001", "timestamp": "2026-06-15T16:52:04.523209+00:00", "input_query": "帮我写一篇关于AI的文章", "is_paraphrase": false, "expected_skill": "content_generator", "expected_execution_mode": "llm_generate", "expected_complexity": "low", "actual_skill": "content_generator", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": false, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 6190.30976598151, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-citation-001", "test_name": "layer1_route-kw-citation-001", "timestamp": "2026-06-15T16:52:04.527767+00:00", "input_query": "检测我们的品牌在AI平台的引用情况", "is_paraphrase": false, "expected_skill": "citation_detector", "expected_execution_mode": "custom", "expected_complexity": "medium", "actual_skill": null, "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.5777029804885387, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-trend-001", "test_name": "layer1_route-kw-trend-001", "timestamp": "2026-06-15T16:52:12.996375+00:00", "input_query": "分析品牌趋势", "is_paraphrase": false, "expected_skill": "trend_agent", "expected_execution_mode": "tool_call", "expected_complexity": "medium", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 8466.026534966659, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-competitor-001", "test_name": "layer1_route-kw-competitor-001", "timestamp": "2026-06-15T16:52:13.000816+00:00", "input_query": "分析我的竞品策略", "is_paraphrase": false, "expected_skill": "competitor_analyzer", "expected_execution_mode": "tool_call", "expected_complexity": "medium", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.310166019015014, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-schema-001", "test_name": "layer1_route-kw-schema-001", "timestamp": "2026-06-15T16:52:13.002942+00:00", "input_query": "帮我优化Schema", "is_paraphrase": false, "expected_skill": "schema_advisor", "expected_execution_mode": "custom", "expected_complexity": "medium", "actual_skill": "schema_advisor", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.13258098624646664, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-monitor-001", "test_name": "layer1_route-kw-monitor-001", "timestamp": "2026-06-15T16:52:13.004811+00:00", "input_query": "监测品牌引用变化", "is_paraphrase": false, "expected_skill": "monitor", "expected_execution_mode": "custom", "expected_complexity": "medium", "actual_skill": "monitor", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.16177998622879386, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-goal-001", "test_name": "layer1_route-kw-goal-001", "timestamp": "2026-06-15T16:52:23.690257+00:00", "input_query": "分析竞品SEO策略并生成优化方案", "is_paraphrase": false, "expected_skill": "goal_driven_agent", "expected_execution_mode": "tool_call", "expected_complexity": "medium", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 10683.721612032969, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-direct-001", "test_name": "semantic_semantic-direct-001", "timestamp": "2026-06-15T16:52:23.694450+00:00", "input_query": "简单生成任务,无需工具调用", "is_paraphrase": false, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": null, "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "semantic_router", "subcategory": "description_match", "response_time_ms": 2.3536229855380952, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-react-001", "test_name": "semantic_semantic-react-001", "timestamp": "2026-06-15T16:52:28.222957+00:00", "input_query": "需要动态适应、逐步推理和工具调用", "is_paraphrase": false, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": "react_agent", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "semantic_router", "subcategory": "description_match", "response_time_ms": 4526.445869996678, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-rewoo-001", "test_name": "semantic_semantic-rewoo-001", "timestamp": "2026-06-15T16:52:28.227266+00:00", "input_query": "多源数据并行采集、无依赖工具调用批量执行", "is_paraphrase": false, "expected_skill": "rewoo_agent", "expected_execution_mode": "rewoo", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "semantic_router", "subcategory": "description_match", "response_time_ms": 2.2753190132789314, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-reflex-001", "test_name": "semantic_semantic-reflex-001", "timestamp": "2026-06-15T16:52:30.828067+00:00", "input_query": "需要高精度和自我验证的任务", "is_paraphrase": false, "expected_skill": "reflexion_agent", "expected_execution_mode": "reflexion", "expected_complexity": "high", "actual_skill": "reflexion_agent", "actual_execution_mode": "reflexion", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "semantic_router", "subcategory": "description_match", "response_time_ms": 2598.3480180148035, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-planexec-001", "test_name": "semantic_semantic-planexec-001", "timestamp": "2026-06-15T16:52:30.830628+00:00", "input_query": "结构化多步骤任务,需要可审查的规划和执行", "is_paraphrase": false, "expected_skill": "plan_exec_agent", "expected_execution_mode": "plan_exec", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": true, "task_succeeded": true, "category": "semantic_router", "subcategory": "description_match", "response_time_ms": 0.47854799777269363, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-geo-001", "test_name": "semantic_semantic-geo-001", "timestamp": "2026-06-15T16:52:30.832637+00:00", "input_query": "对文章进行GEO/SEO优化,提升在AI搜索引擎中的可见性", "is_paraphrase": false, "expected_skill": "geo_optimizer", "expected_execution_mode": "llm_generate", "expected_complexity": "low", "actual_skill": "geo_optimizer", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "semantic_router", "subcategory": "description_match", "response_time_ms": 0.18697103951126337, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-citation-001", "test_name": "semantic_semantic-citation-001", "timestamp": "2026-06-15T16:52:30.836459+00:00", "input_query": "检测品牌在各AI平台回答中的引用情况", "is_paraphrase": false, "expected_skill": "citation_detector", "expected_execution_mode": "custom", "expected_complexity": "medium", "actual_skill": null, "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "semantic_router", "subcategory": "description_match", "response_time_ms": 2.184002019930631, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-competitor-001", "test_name": "semantic_semantic-competitor-001", "timestamp": "2026-06-15T16:52:30.838268+00:00", "input_query": "分析竞品策略、对比品牌差距或发现竞争机会", "is_paraphrase": false, "expected_skill": "competitor_analyzer", "expected_execution_mode": "tool_call", "expected_complexity": "medium", "actual_skill": "competitor_analyzer", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "semantic_router", "subcategory": "description_match", "response_time_ms": 0.15963200712576509, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-colloquial-review-001", "test_name": "semantic_semantic-colloquial-review-001", "timestamp": "2026-06-15T16:52:42.892865+00:00", "input_query": "帮我看看代码有没有问题", "is_paraphrase": false, "expected_skill": "code_reviewer", "expected_execution_mode": "react", "expected_complexity": "medium", "actual_skill": "code_reviewer", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "semantic_router", "subcategory": "colloquial_match", "response_time_ms": 12052.96553100925, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-colloquial-trend-001", "test_name": "semantic_semantic-colloquial-trend-001", "timestamp": "2026-06-15T16:52:42.895968+00:00", "input_query": "最近市场行情怎么样", "is_paraphrase": false, "expected_skill": "trend_agent", "expected_execution_mode": "tool_call", "expected_complexity": "medium", "actual_skill": "trend_agent", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "semantic_router", "subcategory": "colloquial_match", "response_time_ms": 0.17141696298494935, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-colloquial-content-001", "test_name": "semantic_semantic-colloquial-content-001", "timestamp": "2026-06-15T16:52:42.899454+00:00", "input_query": "帮我写点东西", "is_paraphrase": false, "expected_skill": "content_generator", "expected_execution_mode": "llm_generate", "expected_complexity": "low", "actual_skill": "content_generator", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": false, "complexity_correct": true, "task_succeeded": true, "category": "semantic_router", "subcategory": "colloquial_match", "response_time_ms": 0.17780199414119124, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-colloquial-citation-001", "test_name": "semantic_semantic-colloquial-citation-001", "timestamp": "2026-06-15T16:52:42.901686+00:00", "input_query": "这个引用对不对", "is_paraphrase": false, "expected_skill": "citation_detector", "expected_execution_mode": "custom", "expected_complexity": "medium", "actual_skill": "citation_detector", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "semantic_router", "subcategory": "colloquial_match", "response_time_ms": 0.13318302808329463, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-colloquial-competitor-001", "test_name": "semantic_semantic-colloquial-competitor-001", "timestamp": "2026-06-15T16:52:42.903890+00:00", "input_query": "对手怎么样", "is_paraphrase": false, "expected_skill": "competitor_analyzer", "expected_execution_mode": "tool_call", "expected_complexity": "medium", "actual_skill": "competitor_analyzer", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "semantic_router", "subcategory": "colloquial_match", "response_time_ms": 0.12990902177989483, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-mixed-review-001", "test_name": "semantic_semantic-mixed-review-001", "timestamp": "2026-06-15T16:52:42.908177+00:00", "input_query": "review一下这段代码", "is_paraphrase": false, "expected_skill": "code_reviewer", "expected_execution_mode": "react", "expected_complexity": "medium", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "semantic_router", "subcategory": "mixed_lang_match", "response_time_ms": 2.3870580480434, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-mixed-geo-001", "test_name": "semantic_semantic-mixed-geo-001", "timestamp": "2026-06-15T16:52:42.910255+00:00", "input_query": "做个SEO优化", "is_paraphrase": false, "expected_skill": "geo_optimizer", "expected_execution_mode": "llm_generate", "expected_complexity": "low", "actual_skill": "geo_optimizer", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": false, "complexity_correct": true, "task_succeeded": true, "category": "semantic_router", "subcategory": "mixed_lang_match", "response_time_ms": 0.1305780024267733, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "semantic-mixed-monitor-001", "test_name": "semantic_semantic-mixed-monitor-001", "timestamp": "2026-06-15T16:52:42.912450+00:00", "input_query": "monitor一下系统状态", "is_paraphrase": false, "expected_skill": "monitor", "expected_execution_mode": "tool_call", "expected_complexity": "medium", "actual_skill": "monitor", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "semantic_router", "subcategory": "mixed_lang_match", "response_time_ms": 0.15147699741646647, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-direct-001", "test_name": "para_orig_route-kw-direct-001", "timestamp": "2026-06-15T16:52:42.914808+00:00", "input_query": "翻译这段话", "is_paraphrase": false, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": "direct_agent", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.1444679801352322, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-direct-001", "test_name": "para_route-kw-direct-001_0", "timestamp": "2026-06-15T16:52:42.915469+00:00", "input_query": "翻译这段话", "is_paraphrase": true, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": "direct_agent", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.12570497347041965, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-direct-001", "test_name": "para_route-kw-direct-001_1", "timestamp": "2026-06-15T16:52:42.915955+00:00", "input_query": "翻译这段话", "is_paraphrase": true, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": "direct_agent", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.11087598977610469, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-direct-001", "test_name": "para_route-kw-direct-001_2", "timestamp": "2026-06-15T16:52:42.916417+00:00", "input_query": "翻译这段话", "is_paraphrase": true, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": "direct_agent", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.1154160127043724, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-direct-002", "test_name": "para_orig_route-kw-direct-002", "timestamp": "2026-06-15T16:52:48.823381+00:00", "input_query": "帮我总结一下", "is_paraphrase": false, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": "direct_agent", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 5905.160357011482, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-direct-002", "test_name": "para_route-kw-direct-002_0", "timestamp": "2026-06-15T16:52:48.826343+00:00", "input_query": "帮我总结一下", "is_paraphrase": true, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": null, "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.4711660225875676, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-direct-002", "test_name": "para_route-kw-direct-002_1", "timestamp": "2026-06-15T16:52:48.826917+00:00", "input_query": "帮我总结一下", "is_paraphrase": true, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": "direct_agent", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.1197229721583426, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-direct-002", "test_name": "para_route-kw-direct-002_2", "timestamp": "2026-06-15T16:52:48.827366+00:00", "input_query": "帮我总结一下", "is_paraphrase": true, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": "direct_agent", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.10035402374342084, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-direct-003", "test_name": "para_orig_route-kw-direct-003", "timestamp": "2026-06-15T16:52:48.830277+00:00", "input_query": "什么是RAG?", "is_paraphrase": false, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": "direct_agent", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.2366119879297912, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-direct-003", "test_name": "para_route-kw-direct-003_0", "timestamp": "2026-06-15T16:52:59.573007+00:00", "input_query": "什么是RAG?", "is_paraphrase": true, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": "direct_agent", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 10742.036784009542, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-direct-003", "test_name": "para_route-kw-direct-003_1", "timestamp": "2026-06-15T16:52:59.575759+00:00", "input_query": "什么是RAG?", "is_paraphrase": true, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": null, "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.285740978550166, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-direct-003", "test_name": "para_route-kw-direct-003_2", "timestamp": "2026-06-15T16:53:03.327613+00:00", "input_query": "什么是RAG?", "is_paraphrase": true, "expected_skill": "direct_agent", "expected_execution_mode": "direct", "expected_complexity": "low", "actual_skill": "direct_agent", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": true, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 3751.3481359928846, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-react-001", "test_name": "para_orig_route-kw-react-001", "timestamp": "2026-06-15T16:53:03.332556+00:00", "input_query": "搜索一下AI Agent市场数据", "is_paraphrase": false, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.620374958496541, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-react-001", "test_name": "para_route-kw-react-001_0", "timestamp": "2026-06-15T16:53:16.156076+00:00", "input_query": "搜索一下AI Agent市场数据", "is_paraphrase": true, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 12823.078655987047, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-react-001", "test_name": "para_route-kw-react-001_1", "timestamp": "2026-06-15T16:53:16.158891+00:00", "input_query": "搜索一下AI Agent市场数据", "is_paraphrase": true, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.353978983592242, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-react-001", "test_name": "para_route-kw-react-001_2", "timestamp": "2026-06-15T16:53:33.222795+00:00", "input_query": "搜索一下AI Agent市场数据", "is_paraphrase": true, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": "trend_agent", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": false, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 17063.475835020654, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-react-002", "test_name": "para_orig_route-kw-react-002", "timestamp": "2026-06-15T16:53:33.226654+00:00", "input_query": "帮我分析这个数据", "is_paraphrase": false, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.1171270054765046, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-react-002", "test_name": "para_route-kw-react-002_0", "timestamp": "2026-06-15T16:53:45.186234+00:00", "input_query": "帮我分析这个数据", "is_paraphrase": true, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 11959.168867964763, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-react-002", "test_name": "para_route-kw-react-002_1", "timestamp": "2026-06-15T16:53:45.188794+00:00", "input_query": "帮我分析这个数据", "is_paraphrase": true, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.155377995222807, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-react-002", "test_name": "para_route-kw-react-002_2", "timestamp": "2026-06-15T16:54:10.649803+00:00", "input_query": "帮我分析这个数据", "is_paraphrase": true, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": "react_agent", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 25460.632925038226, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-react-003", "test_name": "para_orig_route-kw-react-003", "timestamp": "2026-06-15T16:54:10.683119+00:00", "input_query": "实时监控竞品动态", "is_paraphrase": false, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.5149360299110413, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-react-003", "test_name": "para_route-kw-react-003_0", "timestamp": "2026-06-15T16:54:26.573171+00:00", "input_query": "实时监控竞品动态", "is_paraphrase": true, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": "competitor_analyzer", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": false, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 15889.646161987912, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-react-003", "test_name": "para_route-kw-react-003_1", "timestamp": "2026-06-15T16:54:26.573841+00:00", "input_query": "实时监控竞品动态", "is_paraphrase": true, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": "competitor_analyzer", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": false, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.1847759704105556, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-react-003", "test_name": "para_route-kw-react-003_2", "timestamp": "2026-06-15T16:54:26.576540+00:00", "input_query": "实时监控竞品动态", "is_paraphrase": true, "expected_skill": "react_agent", "expected_execution_mode": "react", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.298591018188745, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-rewoo-001", "test_name": "para_orig_route-kw-rewoo-001", "timestamp": "2026-06-15T16:54:26.578588+00:00", "input_query": "采集A、B、C三个竞品的功能数据", "is_paraphrase": false, "expected_skill": "rewoo_agent", "expected_execution_mode": "rewoo", "expected_complexity": "high", "actual_skill": "competitor_analyzer", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": false, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.1343649928458035, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-rewoo-001", "test_name": "para_route-kw-rewoo-001_0", "timestamp": "2026-06-15T16:54:26.579414+00:00", "input_query": "采集A、B、C三个竞品的功能数据", "is_paraphrase": true, "expected_skill": "rewoo_agent", "expected_execution_mode": "rewoo", "expected_complexity": "high", "actual_skill": "rewoo_agent", "actual_execution_mode": "rewoo", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": true, "execution_mode_correct": true, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.18782296683639288, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-rewoo-001", "test_name": "para_route-kw-rewoo-001_1", "timestamp": "2026-06-15T16:54:26.579942+00:00", "input_query": "采集A、B、C三个竞品的功能数据", "is_paraphrase": true, "expected_skill": "rewoo_agent", "expected_execution_mode": "rewoo", "expected_complexity": "high", "actual_skill": "competitor_analyzer", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": false, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.11551898205652833, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-rewoo-001", "test_name": "para_route-kw-rewoo-001_2", "timestamp": "2026-06-15T16:54:26.580533+00:00", "input_query": "采集A、B、C三个竞品的功能数据", "is_paraphrase": true, "expected_skill": "rewoo_agent", "expected_execution_mode": "rewoo", "expected_complexity": "high", "actual_skill": "competitor_analyzer", "actual_execution_mode": "skill_react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": false, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 0.12395897647365928, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-rewoo-002", "test_name": "para_orig_route-kw-rewoo-002", "timestamp": "2026-06-15T16:54:38.782975+00:00", "input_query": "并行搜索多个关键词", "is_paraphrase": false, "expected_skill": "rewoo_agent", "expected_execution_mode": "rewoo", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 12200.403939001262, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-rewoo-002", "test_name": "para_route-kw-rewoo-002_0", "timestamp": "2026-06-15T16:54:38.785626+00:00", "input_query": "并行搜索多个关键词", "is_paraphrase": true, "expected_skill": "rewoo_agent", "expected_execution_mode": "rewoo", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.231539983768016, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-rewoo-002", "test_name": "para_route-kw-rewoo-002_1", "timestamp": "2026-06-15T16:54:51.536812+00:00", "input_query": "并行搜索多个关键词", "is_paraphrase": true, "expected_skill": "rewoo_agent", "expected_execution_mode": "rewoo", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 12750.671702960972, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-rewoo-002", "test_name": "para_route-kw-rewoo-002_2", "timestamp": "2026-06-15T16:54:51.539379+00:00", "input_query": "并行搜索多个关键词", "is_paraphrase": true, "expected_skill": "rewoo_agent", "expected_execution_mode": "rewoo", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.15318298432976, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-reflex-001", "test_name": "para_orig_route-kw-reflex-001", "timestamp": "2026-06-15T16:55:04.031642+00:00", "input_query": "审查这段代码的合规性", "is_paraphrase": false, "expected_skill": "reflexion_agent", "expected_execution_mode": "reflexion", "expected_complexity": "high", "actual_skill": "code_reviewer", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": false, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 12490.38528103847, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-reflex-001", "test_name": "para_route-kw-reflex-001_0", "timestamp": "2026-06-15T16:55:04.034439+00:00", "input_query": "审查这段代码的合规性", "is_paraphrase": true, "expected_skill": "reflexion_agent", "expected_execution_mode": "reflexion", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.3601570283062756, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-reflex-001", "test_name": "para_route-kw-reflex-001_1", "timestamp": "2026-06-15T16:55:16.172980+00:00", "input_query": "审查这段代码的合规性", "is_paraphrase": true, "expected_skill": "reflexion_agent", "expected_execution_mode": "reflexion", "expected_complexity": "high", "actual_skill": "code_reviewer", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": false, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 12138.08024401078, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-reflex-001", "test_name": "para_route-kw-reflex-001_2", "timestamp": "2026-06-15T16:55:16.175857+00:00", "input_query": "审查这段代码的合规性", "is_paraphrase": true, "expected_skill": "reflexion_agent", "expected_execution_mode": "reflexion", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.3903230321593583, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-reflex-002", "test_name": "para_orig_route-kw-reflex-002", "timestamp": "2026-06-15T16:55:27.673019+00:00", "input_query": "生成一个高精度的数据分析脚本", "is_paraphrase": false, "expected_skill": "reflexion_agent", "expected_execution_mode": "reflexion", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 11494.954236026388, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-reflex-002", "test_name": "para_route-kw-reflex-002_0", "timestamp": "2026-06-15T16:55:27.675594+00:00", "input_query": "生成一个高精度的数据分析脚本", "is_paraphrase": true, "expected_skill": "reflexion_agent", "expected_execution_mode": "reflexion", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.139441028703004, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-reflex-002", "test_name": "para_route-kw-reflex-002_1", "timestamp": "2026-06-15T16:55:44.080887+00:00", "input_query": "生成一个高精度的数据分析脚本", "is_paraphrase": true, "expected_skill": "reflexion_agent", "expected_execution_mode": "reflexion", "expected_complexity": "high", "actual_skill": "direct_agent", "actual_execution_mode": "direct_chat", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": false, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 16404.873749008402, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null }, { "benchmark_id": "route-kw-reflex-002", "test_name": "para_route-kw-reflex-002_2", "timestamp": "2026-06-15T16:55:44.083850+00:00", "input_query": "生成一个高精度的数据分析脚本", "is_paraphrase": true, "expected_skill": "reflexion_agent", "expected_execution_mode": "reflexion", "expected_complexity": "high", "actual_skill": null, "actual_execution_mode": "react", "actual_status_code": 200, "actual_response_keys": [], "actual_complexity_score": null, "actual_match_method": null, "actual_match_confidence": null, "skill_correct": null, "execution_mode_correct": false, "complexity_correct": false, "task_succeeded": true, "category": "routing", "subcategory": "keyword_match", "response_time_ms": 2.364657004363835, "error_message": null, "alignment_violations": 0, "cascade_alert": false, "output_quality_score": null, "output_quality_reasoning": null } ], "output_quality_evaluations": [ { "benchmark_id": "route-edge-explicit-001", "input_query": "@skill:react_agent 搜索最新的AI新闻", "expected_skill": "react_agent", "actual_skill": "react_agent", "quality_score": 5.0, "reasoning": "路由精准匹配用户指定的技能与意图,执行模式完全正确。", "evaluated": true }, { "benchmark_id": "route-kw-direct-001", "input_query": "翻译这段话", "expected_skill": "direct_agent", "actual_skill": "direct_agent", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-direct-002", "input_query": "帮我总结一下", "expected_skill": "direct_agent", "actual_skill": "direct_agent", "quality_score": 4.0, "reasoning": "路由与期望完全一致,direct_chat模式适合处理此类缺乏具体上下文的模糊指令,以便进行澄清或基于历史对话进行总结。", "evaluated": true }, { "benchmark_id": "route-kw-direct-003", "input_query": "什么是RAG?", "expected_skill": "direct_agent", "actual_skill": "direct_agent", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-rewoo-001", "input_query": "采集A、B、C三个竞品的功能数据", "expected_skill": "rewoo_agent", "actual_skill": "competitor_analyzer", "quality_score": 0.0, "reasoning": "Evaluation error: Invalid \\escape: line 1 column 35 (char 34)", "evaluated": false }, { "benchmark_id": "route-kw-planexec-002", "input_query": "规划产品优化方案", "expected_skill": "plan_exec_agent", "actual_skill": "plan_exec_agent", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-geo-001", "input_query": "帮我优化这篇文章的SEO", "expected_skill": "geo_optimizer", "actual_skill": "geo_optimizer", "quality_score": 5.0, "reasoning": "路由精准匹配期望技能,且技能名称完全契合用户优化SEO的意图。", "evaluated": true }, { "benchmark_id": "route-kw-deai-001", "input_query": "帮我把这篇文章去AI化", "expected_skill": "deai_agent", "actual_skill": "deai_agent", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-content-001", "input_query": "帮我写一篇关于AI的文章", "expected_skill": "content_generator", "actual_skill": "content_generator", "quality_score": 0.0, "reasoning": "Evaluation error: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)", "evaluated": false }, { "benchmark_id": "route-kw-schema-001", "input_query": "帮我优化Schema", "expected_skill": "schema_advisor", "actual_skill": "schema_advisor", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-monitor-001", "input_query": "监测品牌引用变化", "expected_skill": "monitor", "actual_skill": "monitor", "quality_score": 5.0, "reasoning": "实际路由技能与期望技能完全一致,精准匹配用户监测品牌引用变化的意图。", "evaluated": true }, { "benchmark_id": "semantic-react-001", "input_query": "需要动态适应、逐步推理和工具调用", "expected_skill": "react_agent", "actual_skill": "react_agent", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "semantic-reflex-001", "input_query": "需要高精度和自我验证的任务", "expected_skill": "reflexion_agent", "actual_skill": "reflexion_agent", "quality_score": 5.0, "reasoning": "实际路由技能与期望技能完全一致,且反思(reflexion)执行模式完美契合高精度与自我验证的任务需求。", "evaluated": true }, { "benchmark_id": "semantic-geo-001", "input_query": "对文章进行GEO/SEO优化,提升在AI搜索引擎中的可见性", "expected_skill": "geo_optimizer", "actual_skill": "geo_optimizer", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "semantic-competitor-001", "input_query": "分析竞品策略、对比品牌差距或发现竞争机会", "expected_skill": "competitor_analyzer", "actual_skill": "competitor_analyzer", "quality_score": 5.0, "reasoning": "实际路由技能与期望技能完全一致,精准匹配用户分析竞品和发现竞争机会的意图。", "evaluated": true }, { "benchmark_id": "semantic-colloquial-review-001", "input_query": "帮我看看代码有没有问题", "expected_skill": "code_reviewer", "actual_skill": "code_reviewer", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "semantic-colloquial-trend-001", "input_query": "最近市场行情怎么样", "expected_skill": "trend_agent", "actual_skill": "trend_agent", "quality_score": 5.0, "reasoning": "实际路由技能与期望技能完全一致,精准匹配了用户查询市场行情的意图。", "evaluated": true }, { "benchmark_id": "semantic-colloquial-content-001", "input_query": "帮我写点东西", "expected_skill": "content_generator", "actual_skill": "content_generator", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "semantic-colloquial-citation-001", "input_query": "这个引用对不对", "expected_skill": "citation_detector", "actual_skill": "citation_detector", "quality_score": 5.0, "reasoning": "路由精准匹配用户意图与期望技能,完全符合检测引用正确性的需求。", "evaluated": true }, { "benchmark_id": "semantic-colloquial-competitor-001", "input_query": "对手怎么样", "expected_skill": "competitor_analyzer", "actual_skill": "competitor_analyzer", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "semantic-mixed-geo-001", "input_query": "做个SEO优化", "expected_skill": "geo_optimizer", "actual_skill": "geo_optimizer", "quality_score": 5.0, "reasoning": "实际路由技能与期望技能完全一致,精准匹配用户的SEO优化意图。", "evaluated": true }, { "benchmark_id": "semantic-mixed-monitor-001", "input_query": "monitor一下系统状态", "expected_skill": "monitor", "actual_skill": "monitor", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-direct-001", "input_query": "翻译这段话", "expected_skill": "direct_agent", "actual_skill": "direct_agent", "quality_score": 5.0, "reasoning": "实际路由与期望技能完全一致,且direct_chat模式能够精准且高质量地处理“翻译”这一直接文本指令。", "evaluated": true }, { "benchmark_id": "route-kw-direct-001", "input_query": "翻译这段话", "expected_skill": "direct_agent", "actual_skill": "direct_agent", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-direct-001", "input_query": "翻译这段话", "expected_skill": "direct_agent", "actual_skill": "direct_agent", "quality_score": 5.0, "reasoning": "实际路由与期望技能完全一致,direct_agent能够精准处理翻译这一直接指令,执行模式完全匹配用户意图。", "evaluated": true }, { "benchmark_id": "route-kw-direct-001", "input_query": "翻译这段话", "expected_skill": "direct_agent", "actual_skill": "direct_agent", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-direct-002", "input_query": "帮我总结一下", "expected_skill": "direct_agent", "actual_skill": "direct_agent", "quality_score": 5.0, "reasoning": "路由精准匹配期望技能,direct_agent能够妥善处理此类缺乏具体上下文的模糊总结指令(如引导追问或基于历史对话进行总结)。", "evaluated": true }, { "benchmark_id": "route-kw-direct-002", "input_query": "帮我总结一下", "expected_skill": "direct_agent", "actual_skill": "direct_agent", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-direct-002", "input_query": "帮我总结一下", "expected_skill": "direct_agent", "actual_skill": "direct_agent", "quality_score": 5.0, "reasoning": "用户输入缺乏具体上下文,路由至direct_agent进行直接对话以澄清意图或引导补充信息是完全正确且最优的处理方式。", "evaluated": true }, { "benchmark_id": "route-kw-direct-003", "input_query": "什么是RAG?", "expected_skill": "direct_agent", "actual_skill": "direct_agent", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-direct-003", "input_query": "什么是RAG?", "expected_skill": "direct_agent", "actual_skill": "direct_agent", "quality_score": 5.0, "reasoning": "路由精准匹配期望技能,直接回答RAG概念完全符合用户意图且质量优秀。", "evaluated": true }, { "benchmark_id": "route-kw-direct-003", "input_query": "什么是RAG?", "expected_skill": "direct_agent", "actual_skill": "direct_agent", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-react-001", "input_query": "搜索一下AI Agent市场数据", "expected_skill": "react_agent", "actual_skill": "trend_agent", "quality_score": 4.0, "reasoning": "实际路由的trend_agent在处理“市场数据”时比通用的react_agent更具针对性,能精准匹配用户获取市场趋势数据的意图。", "evaluated": true }, { "benchmark_id": "route-kw-react-002", "input_query": "帮我分析这个数据", "expected_skill": "react_agent", "actual_skill": "react_agent", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-react-003", "input_query": "实时监控竞品动态", "expected_skill": "react_agent", "actual_skill": "competitor_analyzer", "quality_score": 5.0, "reasoning": "实际路由的competitor_analyzer比期望的通用react_agent更精准地垂直匹配了“竞品动态”这一具体意图,路由精准且执行模式合理。", "evaluated": true }, { "benchmark_id": "route-kw-react-003", "input_query": "实时监控竞品动态", "expected_skill": "react_agent", "actual_skill": "competitor_analyzer", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-rewoo-001", "input_query": "采集A、B、C三个竞品的功能数据", "expected_skill": "rewoo_agent", "actual_skill": "competitor_analyzer", "quality_score": 0.0, "reasoning": "Evaluation error: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)", "evaluated": false }, { "benchmark_id": "route-kw-rewoo-001", "input_query": "采集A、B、C三个竞品的功能数据", "expected_skill": "rewoo_agent", "actual_skill": "rewoo_agent", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-rewoo-001", "input_query": "采集A、B、C三个竞品的功能数据", "expected_skill": "rewoo_agent", "actual_skill": "competitor_analyzer", "quality_score": 3.0, "reasoning": "路由到了竞品相关技能,领域高度匹配,但“分析”与“采集”侧重点略有偏差,且处理多竞品采集任务时可能不如通用规划Agent完整灵活。", "evaluated": true }, { "benchmark_id": "route-kw-rewoo-001", "input_query": "采集A、B、C三个竞品的功能数据", "expected_skill": "rewoo_agent", "actual_skill": "competitor_analyzer", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-reflex-001", "input_query": "审查这段代码的合规性", "expected_skill": "reflexion_agent", "actual_skill": "code_reviewer", "quality_score": 4.0, "reasoning": "实际路由code_reviewer高度契合审查代码的意图,虽与期望的reflexion_agent不同,但路由准确且直接有效。", "evaluated": true }, { "benchmark_id": "route-kw-reflex-001", "input_query": "审查这段代码的合规性", "expected_skill": "reflexion_agent", "actual_skill": "code_reviewer", "quality_score": 0.0, "reasoning": "Evaluation error: Event loop is closed", "evaluated": false }, { "benchmark_id": "route-kw-reflex-002", "input_query": "生成一个高精度的数据分析脚本", "expected_skill": "reflexion_agent", "actual_skill": "direct_agent", "quality_score": 0.0, "reasoning": "Evaluation error: Invalid \\escape: line 1 column 82 (char 81)", "evaluated": false } ] }