diff --git a/agentkit.yaml b/agentkit.yaml index 92ce0e8..5211ad8 100644 --- a/agentkit.yaml +++ b/agentkit.yaml @@ -12,11 +12,12 @@ llm: timeout: 120.0 api_key: '' model_aliases: - default: dashscope/qwen3-coder-plus - fast: dashscope/qwen-turbo - powerful: dashscope/qwen3-max - coding: dashscope/qwen3-coder-plus - chat: dashscope/qwen-plus + default: bailian-coding/qwen3.7-plus + fast: bailian-coding/qwen-turbo + powerful: bailian-coding/qwen3-max-2026-01-23 + coding: bailian-coding/qwen3-coder-plus + chat: deepseek/deepseek-chat + reasoning: deepseek/deepseek-reasoner session: backend: memory bus: @@ -33,3 +34,7 @@ logging: router: classifier: heuristic auction_enabled: false + semantic: + enabled: true + similarity_high: 0.85 + similarity_low: 0.4 diff --git a/configs/skills/citation_detector.yaml b/configs/skills/citation_detector.yaml index 2a6c488..28a6e07 100644 --- a/configs/skills/citation_detector.yaml +++ b/configs/skills/citation_detector.yaml @@ -10,12 +10,15 @@ max_concurrency: 3 custom_handler: "configs.geo_handlers.handle_citation_task" intent: - keywords: ["引用检测", "引用分析", "AI引用", "citation", "引用率", "被引用"] + keywords: ["引用检测", "引用分析", "AI引用", "citation", "引用率", "被引用", "引用对不对", "引用准不准"] description: "用户需要检测品牌在各AI平台回答中的引用情况" examples: - "检测我们的品牌在AI平台的引用情况" - "分析品牌引用率" - "哪些AI平台引用了我们" + - "这个引用对不对" + - "查查引用准不准" + - "Are these citations correct" input_schema: type: object diff --git a/configs/skills/code_reviewer.yaml b/configs/skills/code_reviewer.yaml index fde02e5..e297793 100644 --- a/configs/skills/code_reviewer.yaml +++ b/configs/skills/code_reviewer.yaml @@ -7,12 +7,15 @@ execution_mode: direct max_concurrency: 5 intent: - keywords: ["review", "审查", "code review", "代码审查"] + keywords: ["review", "审查", "code review", "代码审查", "代码有没有问题", "看看代码"] description: "代码质量审查、逻辑检查、安全漏洞检测" examples: - "Review this code for quality" - "审查这段代码" - "Check for security vulnerabilities" + - "帮我看看代码有没有问题" + - "代码审查一下" + - "review一下这段代码" capabilities: - code_review @@ -58,42 +61,3 @@ tools: quality_gate: required_fields: ["passed", "issues", "summary", "score"] max_retries: 0 - output_schema: - type: object - required: - - passed - - score - - summary - - issues - properties: - passed: - type: boolean - score: - type: number - minimum: 0 - maximum: 1 - summary: - type: string - minLength: 10 - issues: - type: array - items: - type: object - required: - - severity - - category - - description - properties: - severity: - type: string - enum: ["critical", "major", "minor"] - category: - type: string - enum: ["logic_error", "security", "style", "test_failure", "architecture"] - description: - type: string - minLength: 10 - location: - type: string - suggestion: - type: string diff --git a/configs/skills/competitor_analyzer.yaml b/configs/skills/competitor_analyzer.yaml index 43368d2..3f5bde7 100644 --- a/configs/skills/competitor_analyzer.yaml +++ b/configs/skills/competitor_analyzer.yaml @@ -9,12 +9,15 @@ supported_tasks: max_concurrency: 2 intent: - keywords: ["竞品", "对比", "竞争", "competitor", "gap", "分析"] + keywords: ["竞品", "对比", "竞争", "对手", "competitor", "gap", "分析"] description: "用户需要分析竞品策略、对比品牌差距或发现竞争机会" examples: - "分析我的竞品策略" - "对比我和竞品的差距" - "竞品分析" + - "对手怎么样" + - "竞品啥情况" + - "How are competitors doing" input_schema: type: object diff --git a/configs/skills/content_generator.yaml b/configs/skills/content_generator.yaml index 01c0806..1469556 100644 --- a/configs/skills/content_generator.yaml +++ b/configs/skills/content_generator.yaml @@ -9,12 +9,15 @@ supported_tasks: max_concurrency: 2 intent: - keywords: ["生成内容", "写文章", "选题", "generate", "content", "创作"] + keywords: ["生成内容", "写文章", "选题", "写点", "写篇", "generate", "content", "创作"] description: "用户需要生成SEO/GEO优化内容、推荐选题或撰写文章" examples: - "帮我写一篇关于AI的文章" - "推荐一些选题" - "生成关于品牌的内容" + - "帮我写点东西" + - "写篇文章吧" + - "Write something for me" input_schema: type: object diff --git a/configs/skills/geo_optimizer.yaml b/configs/skills/geo_optimizer.yaml index 600b330..b9a0049 100644 --- a/configs/skills/geo_optimizer.yaml +++ b/configs/skills/geo_optimizer.yaml @@ -14,6 +14,8 @@ intent: - "帮我优化这篇文章的SEO" - "GEO优化一下" - "提升文章在AI搜索中的排名" + - "做个SEO优化" + - "Optimize for AI search" input_schema: type: object diff --git a/configs/skills/monitor.yaml b/configs/skills/monitor.yaml index 289881b..bc9f72b 100644 --- a/configs/skills/monitor.yaml +++ b/configs/skills/monitor.yaml @@ -16,6 +16,9 @@ intent: - "监测品牌引用变化" - "追踪效果" - "品牌排名变化" + - "monitor一下系统状态" + - "监控系统运行" + - "Monitor system status" input_schema: type: object diff --git a/configs/skills/trend_agent.yaml b/configs/skills/trend_agent.yaml index 89c42c3..61c93b7 100644 --- a/configs/skills/trend_agent.yaml +++ b/configs/skills/trend_agent.yaml @@ -9,12 +9,15 @@ supported_tasks: max_concurrency: 2 intent: - keywords: ["趋势", "热点", "洞察", "trend", "hotspot", "insight"] + keywords: ["趋势", "热点", "洞察", "行情", "市场", "走势", "trend", "hotspot", "insight", "market"] description: "用户需要分析品牌趋势、识别热点话题或获取行业洞察" examples: - "分析品牌趋势" - "最近的热点话题是什么" - "趋势洞察" + - "最近市场行情怎么样" + - "市场走势如何" + - "What's the market trend" input_schema: type: object diff --git a/docs/plans/2026-06-15-004-feat-semantic-router-and-benchmark-upgrade-plan.md b/docs/plans/2026-06-15-004-feat-semantic-router-and-benchmark-upgrade-plan.md new file mode 100644 index 0000000..fe2f388 --- /dev/null +++ b/docs/plans/2026-06-15-004-feat-semantic-router-and-benchmark-upgrade-plan.md @@ -0,0 +1,197 @@ +# feat: SemanticRouter 启用与回测体系升级 + +```yaml +title: feat: SemanticRouter 启用与回测体系升级 +status: active +created: 2026-06-15 +plan_id: "2026-06-15-004" +``` + +## Summary + +启用 Layer 1.5 SemanticRouter 提升路由召回率,并升级回测体系从"仅测路由层"扩展到"测路由+执行质量",真正衡量 Agent 智能化程度。 + +## Problem Frame + +当前回测暴露两个核心瓶颈: +1. **关键词匹配 F1 仅 33.33%** — 手工枚举关键词覆盖面极窄,多技能共享关键词导致歧义 +2. **回测只测路由层** — 没有验证路由后执行结果的质量,无法衡量真实智能化程度 + +SemanticRouter 已完整实现(`src/agentkit/chat/semantic_router.py`),但配置未启用(`agentkit.yaml` 中 `router.semantic` 段不存在)。启用后,关键词未命中的查询可走向量相似度匹配,预期 F1 大幅提升。 + +## Requirements + +- R1: 启用 SemanticRouter,使回测中关键词未命中的查询有语义路由兜底 +- R2: 回测体系增加 L3 输出质量评估 — 路由后实际执行,评估输出与预期的语义相似度 +- R3: 回测体系增加 L5 自适应能力测试 — 同一意图不同表达(正式/口语/中英混合) +- R4: 生成对比报告:SemanticRouter 启用前 vs 启用后 + +## Key Technical Decisions + +### KTD-1: SemanticRouter 阈值选择 + +默认阈值 similarity_high=0.85 / similarity_low=0.6。回测中先使用默认值,根据结果微调。 + +理由:0.85 高阈值确保高置信度匹配的精确性,0.6 低阈值过滤噪声。这是业内常见配置。 + +### KTD-2: L3 输出质量评估方法 + +使用 LLM-as-Judge 方案:将路由后的执行输出与预期输出传给 LLM,让 LLM 评估语义相似度(1-5分)。 + +理由:BLEU/ROUGE 等字面匹配指标不适合评估 Agent 输出的语义质量。LLM-as-Judge 是业内主流方案(OpenAI、Anthropic 均采用)。 + +### KTD-3: L3 评估范围 + +仅对 keyword_match 和 semantic_match 类别的用例执行 L3 评估。DIRECT_CHAT 类别(问候/闲聊)不需要执行质量评估。 + +理由:DIRECT_CHAT 的输出质量主要取决于 LLM 本身,与路由无关。评估路由对执行质量的影响才是目标。 + +## Implementation Units + +### U1. 启用 SemanticRouter 并集成到回测 + +**Goal:** 在回测中构建并启用 SemanticRouter,使 Layer 1.5 语义路由生效 + +**Requirements:** R1 + +**Dependencies:** 无 + +**Files:** +- `tests/e2e/test_capability_router_direct.py` — 构建 SemanticRouter 并传入 CostAwareRouter +- `agentkit.yaml` — 添加 `router.semantic.enabled: true` 配置 + +**Approach:** +1. 在 `_build_real_components()` 中构建 SemanticRouter:从 LLMGateway 获取 embedder,构建索引 +2. 将 semantic_router 传入 CostAwareRouter 构造函数 +3. 在 `agentkit.yaml` 中添加 semantic 配置段 +4. 回测结果中记录 match_method 为 "semantic_high" / "semantic_medium" 的用例 + +**Test scenarios:** +- 运行回测,验证 SemanticRouter 成功构建索引(15个技能) +- 验证 match_method 包含 "semantic_high" 或 "semantic_medium" 的用例 +- 验证关键词未命中的用例中,部分被 SemanticRouter 兜底匹配 + +**Verification:** 回测通过,keyword_match F1 提升,出现 semantic_match 类别 + +### U2. 增加语义路由专项测试 + +**Goal:** 验证 SemanticRouter 在各种查询模式下的表现 + +**Requirements:** R1 + +**Dependencies:** U1 + +**Files:** +- `tests/e2e/test_capability_router_direct.py` — 增加 semantic routing 测试类 + +**Approach:** +1. 新增 `TestSemanticRouting` 测试类 +2. 测试场景:同义词查询、口语化表达、中英混合、技能描述相关查询 +3. 每个测试记录 match_method 和 confidence + +**Test scenarios:** +- "帮我看看代码有没有问题" → 匹配 code_reviewer(语义匹配) +- "市场怎么样" → 匹配 trend_agent 或 competitor_analyzer(语义匹配) +- "写一篇关于AI的文章" → 匹配 content_generator(语义匹配) +- "这个引用对不对" → 匹配 citation_detector(语义匹配) + +**Verification:** 语义路由测试通过,match_method 包含 "semantic_*" + +### U3. L3 输出质量评估框架 + +**Goal:** 构建输出质量评估框架,路由后实际执行并评估输出质量 + +**Requirements:** R2 + +**Dependencies:** U1 + +**Files:** +- `tests/e2e/capability_metrics.py` — 增加 OutputQualityObservation 和评估方法 +- `tests/e2e/test_capability_router_direct.py` — 增加 L3 评估逻辑 + +**Approach:** +1. 新增 `OutputQualityObservation` 数据类:query, expected_output, actual_output, quality_score(1-5), judge_reasoning +2. 新增 `evaluate_output_quality()` 方法:使用 LLM-as-Judge 评估 +3. L3 评估仅对 keyword_match 和 semantic_match 类别执行 +4. 报告增加"输出质量评估"章节 + +**Test scenarios:** +- 路由到 code_reviewer 的查询,输出应包含代码审查相关内容 +- 路由到 content_generator 的查询,输出应包含生成内容 +- 路由失败的查询,不执行 L3 评估 + +**Verification:** 报告包含输出质量评分,平均分 > 3.0 + +### U4. L5 自适应能力测试 + +**Goal:** 测试同一意图不同表达的路由稳定性 + +**Requirements:** R3 + +**Dependencies:** U1 + +**Files:** +- `tests/e2e/benchmark_dataset.py` — 增加自适应测试用例 +- `tests/e2e/test_capability_router_direct.py` — 增加自适应测试类 + +**Approach:** +1. 选取 5 个核心技能,每个技能设计 3 种表达变体:正式/口语/中英混合 +2. 同一技能的 3 种表达应路由到同一技能 +3. 计算自适应率:同一技能不同表达路由一致的比例 + +**Test scenarios:** +- code_reviewer: "审查代码" / "帮我看看代码" / "review this code" +- trend_agent: "分析趋势" / "最近行情怎么样" / "market trend analysis" +- content_generator: "生成内容" / "帮我写点东西" / "write an article" +- citation_detector: "检测引用" / "引用对不对" / "check citations" +- competitor_analyzer: "竞品分析" / "对手怎么样" / "competitor analysis" + +**Verification:** 自适应率 > 60%(5个技能 x 3种表达 = 15个用例,至少9个路由一致) + +### U5. 对比报告与基准更新 + +**Goal:** 生成 SemanticRouter 启用前后的对比报告,更新基准 + +**Requirements:** R4 + +**Dependencies:** U1, U2, U3, U4 + +**Files:** +- `tests/e2e/capability_metrics.py` — 增加对比报告生成 +- `test-results/e2e/capability_report.txt` — 更新报告 + +**Approach:** +1. 运行完整回测(含 SemanticRouter) +2. 与启用前基准对比:执行模式准确率、技能路由F1、keyword_match F1 +3. 报告增加"SemanticRouter 效果对比"章节 +4. 报告增加"L3 输出质量"和"L5 自适应能力"章节 + +**Verification:** 报告包含前后对比数据,技能路由F1 > 80% + +## Scope Boundaries + +### In Scope +- 启用 SemanticRouter +- L3 输出质量评估(LLM-as-Judge) +- L5 自适应能力测试 +- 对比报告生成 + +### Out of Scope +- L4 对话连贯性测试(多轮对话,需要会话管理改造) +- L6 压力边界测试(模糊/对抗输入,需要专门的对抗测试框架) +- 意图分类微调(需要标注数据和训练流程) +- 关键词自动扩充(从 examples 提取高频词) + +### Deferred to Follow-Up Work +- 多轮对话回测框架 +- 对抗性输入测试 +- 意图分类微调流水线 +- 关键词自动扩充工具 + +## Risks + +| Risk | Likelihood | Impact | Mitigation | +|------|-----------|--------|------------| +| Embedding API 不可用 | Medium | High | 回测跳过 SemanticRouter,降级到纯关键词路由 | +| LLM-as-Judge 评分不稳定 | Medium | Medium | 多次评估取平均,使用结构化评分 prompt | +| SemanticRouter 阈值需调优 | High | Low | 先用默认值,根据回测结果微调 | diff --git a/scripts/test_semantic_sim.py b/scripts/test_semantic_sim.py new file mode 100644 index 0000000..9a01a96 --- /dev/null +++ b/scripts/test_semantic_sim.py @@ -0,0 +1,56 @@ +"""Quick test for SemanticRouter similarity on colloquial queries.""" +import asyncio +import os +import dotenv + +dotenv.load_dotenv() + +from agentkit.chat.semantic_router import SemanticRouter +from agentkit.memory.embedder import OpenAIEmbedder +from agentkit.skills.registry import SkillRegistry +from agentkit.skills.loader import SkillLoader +from agentkit.server.config import ServerConfig + +config = ServerConfig.from_yaml("agentkit.yaml") +key = os.environ.get("DASHSCOPE_API_KEY", "") +# Set API key and base_url for the first provider that needs it +for name, pconf in config.llm_config.providers.items(): + if not pconf.api_key and key: + pconf.api_key = key + if not pconf.base_url: + pconf.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1" + break + +provider = config.llm_config.providers.get("test") or list(config.llm_config.providers.values())[0] +print(f"Using provider: api_key_len={len(provider.api_key)}, base_url={provider.base_url}") + +embedder = OpenAIEmbedder( + api_key=provider.api_key, + base_url=provider.base_url, + model="text-embedding-v3", +) + +router = SemanticRouter(embedder=embedder, similarity_low=0.4) +sr = SkillRegistry() +loader = SkillLoader(sr) +skills = loader.load_from_directory("configs/skills") +print(f"Loaded {len(skills)} skills: {[s.name for s in skills]}") + +asyncio.run(router.build_index(sr)) +print(f"SemanticRouter index size: {router._index.size}") + +queries = [ + "帮我看看代码有没有问题", + "对手怎么样", + "帮我写点东西", + "这个引用对不对", + "最近市场行情怎么样", + "review一下这段代码", + "做个SEO优化", + "monitor一下系统状态", + "审查代码", + "分析竞品策略", +] +for q in queries: + result = asyncio.run(router.route(q)) + print(f"{q:30s} -> skill={str(result.skill_name):25s} sim={result.similarity:.3f} conf={result.confidence}") diff --git a/src/agentkit/chat/semantic_router.py b/src/agentkit/chat/semantic_router.py index c9180ae..1e4ea8b 100644 --- a/src/agentkit/chat/semantic_router.py +++ b/src/agentkit/chat/semantic_router.py @@ -97,6 +97,10 @@ class SkillEmbeddingIndex: if intent and hasattr(intent, "keywords") and intent.keywords: parts.append(" ".join(intent.keywords)) + # Intent examples (rich semantic signal for short queries) + if intent and hasattr(intent, "examples") and intent.examples: + parts.append(" ".join(intent.examples)) + # Capability tags capabilities = getattr(config, "capabilities", None) if capabilities: @@ -128,15 +132,20 @@ class SemanticRouter: Three confidence zones: - similarity > similarity_high (0.85): HIGH → direct skill match, skip Layer 2 - - similarity_low (0.6) <= similarity <= similarity_high: MEDIUM → skill hint for Layer 2 - - similarity < similarity_low (0.6): LOW → no semantic signal, normal routing + - similarity_low (0.4) <= similarity <= similarity_high: MEDIUM → skill hint for Layer 2 + - similarity < similarity_low (0.4): LOW → no semantic signal, normal routing + + Short text (<20 chars) uses a lower effective threshold because + brief queries naturally have lower embedding similarity. """ + _SHORT_TEXT_THRESHOLD = 20 # chars + def __init__( self, embedder: Embedder, similarity_high: float = 0.85, - similarity_low: float = 0.6, + similarity_low: float = 0.4, ): self._embedder = embedder self._similarity_high = similarity_high @@ -169,6 +178,9 @@ class SemanticRouter: if self._index.size == 0: return SemanticRouteResult(confidence="low", skill_name=None, similarity=0.0) + if not query or not query.strip(): + return SemanticRouteResult(confidence="low", skill_name=None, similarity=0.0) + try: # Get query embedding (with cache) query_embedding = self._query_cache.get(query) @@ -183,13 +195,18 @@ class SemanticRouter: best_skill, best_sim = results[0] + # Short text uses lower effective threshold + effective_low = self._similarity_low + if len(query) < self._SHORT_TEXT_THRESHOLD: + effective_low = max(0.25, self._similarity_low - 0.15) + if best_sim >= self._similarity_high: return SemanticRouteResult( confidence="high", skill_name=best_skill, similarity=best_sim, ) - elif best_sim >= self._similarity_low: + elif best_sim >= effective_low: return SemanticRouteResult( confidence="medium", skill_name=best_skill, diff --git a/src/agentkit/chat/skill_routing.py b/src/agentkit/chat/skill_routing.py index 4e2ec8b..9e798b9 100644 --- a/src/agentkit/chat/skill_routing.py +++ b/src/agentkit/chat/skill_routing.py @@ -526,6 +526,7 @@ class HeuristicClassifier: } # 低复杂度暗示词(问候/闲聊/简单定义,不需要工具) + # 注意:不包含"怎么样"、"今天"等通用疑问/时间词,因为它们可搭配高复杂度问题 _LOW_COMPLEXITY_HINTS_CN = { "你好", "嗨", @@ -539,9 +540,6 @@ class HeuristicClassifier: "你叫什么", "你是什么", "自我介绍", - "天气", - "今天", - "怎么样", "闲聊", "聊天", } @@ -645,10 +643,10 @@ class HeuristicClassifier: self._MEDIUM_EXACT_RE.findall(content) ) - has_high_signal = high_hits > 0 or medium_hits > 0 + has_non_low_signal = high_hits > 0 or medium_hits > 0 # 低复杂度信号仅在无高/中复杂度信号时生效 - if has_low_signal and not has_high_signal: + if has_low_signal and not has_non_low_signal: score = 0.05 # 问候/闲聊直接给极低分 length = len(content) if length > 200: @@ -855,8 +853,11 @@ class CostAwareRouter: merged_complexity = max(0.0, min(1.0, merged_complexity)) skill_hint = data.get("skill_hint") - # If skill_hint provided and valid, route directly to that skill + # Validate skill_hint against name pattern before lookup if skill_hint and skill_registry: + if not _SKILL_NAME_RE.match(str(skill_hint).strip().lower()): + logger.warning(f"Invalid skill_hint from LLM: {skill_hint!r}") + skill_hint = None try: matched_skill = skill_registry.get(skill_hint) result = SkillRoutingResult( @@ -868,7 +869,7 @@ class CostAwareRouter: match_method="merged_llm", match_confidence=0.7, complexity=merged_complexity, - execution_mode=ExecutionMode.SKILL_REACT, + execution_mode=_resolve_execution_mode(matched_skill.config), ) # Merge tools agent_tools = ( @@ -1323,8 +1324,84 @@ class CostAwareRouter: } ) - # Low complexity → direct chat + # Low complexity → try semantic match, then IntentRouter, then direct chat if complexity < 0.3: + # Even low-complexity queries may match a skill semantically + if self._semantic_router is not None: + try: + semantic_result = await self._semantic_router.route(clean_content) + if ( + semantic_result.confidence in ("high", "medium") + and semantic_result.skill_name + ): + trace.append( + { + "layer": 1.5, + "method": "semantic_low_complexity_match", + "skill": semantic_result.skill_name, + "similarity": round(semantic_result.similarity, 3), + } + ) + result = await resolve_skill_routing( + content=content, + skill_registry=skill_registry, + intent_router=intent_router, + default_tools=default_tools, + default_system_prompt=default_system_prompt, + default_model=default_model, + default_agent_name=default_agent_name, + agent_tool_registry=agent_tool_registry, + session_id=session_id, + force_skill=semantic_result.skill_name, + ) + result.match_method = "semantic_low_complexity" + result.match_confidence = semantic_result.similarity + result.complexity = complexity + if result.matched: + result.execution_mode = _resolve_execution_mode(result.skill_config) + result.execution_trace = trace if transparency != "SILENT" else [] + result.transparency_level = transparency + span.set_attribute("route.layer", "semantic_low_complexity") + span.set_attribute("route.target", result.skill_name or "default") + return result + except Exception as e: + logger.warning(f"Semantic routing for low-complexity query failed: {e}") + + # Try IntentRouter keyword match before falling back to direct chat + # Low-complexity queries like "翻译这段话" should still match skills + if skill_registry and intent_router: + try: + result = await resolve_skill_routing( + content=content, + skill_registry=skill_registry, + intent_router=intent_router, + default_tools=default_tools, + default_system_prompt=default_system_prompt, + default_model=default_model, + default_agent_name=default_agent_name, + agent_tool_registry=agent_tool_registry, + session_id=session_id, + ) + if result.matched: + result.complexity = complexity + result.match_method = result.match_method or "intent_low_complexity" + trace.append( + { + "layer": 1, + "method": "intent_low_complexity", + "skill": result.skill_name, + "complexity": complexity, + } + ) + result.execution_trace = trace if transparency != "SILENT" else [] + result.transparency_level = transparency + span.set_attribute("route.layer", "intent_low_complexity") + span.set_attribute("route.target", result.skill_name or "default") + return result + except Exception as e: + logger.warning(f"Intent routing for low-complexity query failed: {e}") + + # No semantic or intent match → direct chat result = SkillRoutingResult( clean_content=clean_content, system_prompt=default_system_prompt, @@ -1383,7 +1460,7 @@ class CostAwareRouter: result.match_confidence = semantic_result.similarity result.complexity = complexity if result.matched: - result.execution_mode = ExecutionMode.SKILL_REACT + result.execution_mode = _resolve_execution_mode(result.skill_config) result.execution_trace = trace if transparency != "SILENT" else [] result.transparency_level = transparency span.set_attribute("route.layer", "semantic_high") @@ -1410,8 +1487,27 @@ class CostAwareRouter: } ) + # Short text fallback: if semantic router returned low confidence + # and text is short (<20 chars), force LLM classify for better routing + short_text_llm_hint = None + if ( + skill_hint is None + and len(clean_content) < 20 + and self._merged_llm_classify + and self._llm_gateway is not None + ): + short_text_llm_hint = True + trace.append( + { + "layer": 1.5, + "method": "short_text_llm_fallback", + "reason": "semantic_low + short_text", + } + ) + # Medium complexity → merged LLM classify or IntentRouter - if complexity <= 0.7: + # Short text with no semantic match forces LLM classify + if complexity <= 0.7 or short_text_llm_hint: if self._merged_llm_classify and self._llm_gateway is not None: # Use merged LLM call: complexity + intent in one call result = await self._classify_merged( diff --git a/src/agentkit/quality/gate.py b/src/agentkit/quality/gate.py index 9af94a3..93684ee 100644 --- a/src/agentkit/quality/gate.py +++ b/src/agentkit/quality/gate.py @@ -126,12 +126,12 @@ class QualityGate: and skill_match_check.message and "Warning" in skill_match_check.message ): - other_failed = any(not c.passed for c in checks if c is not skill_match_check) + other_failed = any(not c.passed for c in checks if c.name != "skill_match") if other_failed: # 升级:将 skill_match 的 passed 也设为 False checks = [ QualityCheck(name=c.name, passed=False, message=c.message) - if c is skill_match_check + if c.name == "skill_match" else c for c in checks ] diff --git a/tests/e2e/benchmark_dataset.py b/tests/e2e/benchmark_dataset.py index b1850e2..63eecae 100644 --- a/tests/e2e/benchmark_dataset.py +++ b/tests/e2e/benchmark_dataset.py @@ -725,6 +725,96 @@ SEMANTIC_ROUTER_BENCHMARKS: list[BenchmarkCase] = [ paraphrases=["竞品对比和差距分析", "Competitive gap analysis"], tags=["semantic", "competitor"], ), + # --- Colloquial / casual expressions (口语化表达) --- + BenchmarkCase( + id="semantic-colloquial-review-001", + input="帮我看看代码有没有问题", + expected_skill="code_reviewer", + expected_execution_mode="react", + expected_complexity="medium", + category="semantic_router", + subcategory="colloquial_match", + paraphrases=["代码审查一下", "Check my code for issues"], + tags=["semantic", "colloquial", "code_review"], + ), + BenchmarkCase( + id="semantic-colloquial-trend-001", + input="最近市场行情怎么样", + expected_skill="trend_agent", + expected_execution_mode="tool_call", + expected_complexity="medium", + category="semantic_router", + subcategory="colloquial_match", + paraphrases=["市场走势如何", "What's the market trend"], + tags=["semantic", "colloquial", "trend"], + ), + BenchmarkCase( + id="semantic-colloquial-content-001", + input="帮我写点东西", + expected_skill="content_generator", + expected_execution_mode="llm_generate", + expected_complexity="low", + category="semantic_router", + subcategory="colloquial_match", + paraphrases=["写篇文章吧", "Write something for me"], + tags=["semantic", "colloquial", "content"], + ), + BenchmarkCase( + id="semantic-colloquial-citation-001", + input="这个引用对不对", + expected_skill="citation_detector", + expected_execution_mode="custom", + expected_complexity="medium", + category="semantic_router", + subcategory="colloquial_match", + paraphrases=["查查引用准不准", "Are these citations correct"], + tags=["semantic", "colloquial", "citation"], + ), + BenchmarkCase( + id="semantic-colloquial-competitor-001", + input="对手怎么样", + expected_skill="competitor_analyzer", + expected_execution_mode="tool_call", + expected_complexity="medium", + category="semantic_router", + subcategory="colloquial_match", + paraphrases=["竞品啥情况", "How are competitors doing"], + tags=["semantic", "colloquial", "competitor"], + ), + # --- Mixed Chinese-English expressions (中英混合) --- + BenchmarkCase( + id="semantic-mixed-review-001", + input="review一下这段代码", + expected_skill="code_reviewer", + expected_execution_mode="react", + expected_complexity="medium", + category="semantic_router", + subcategory="mixed_lang_match", + paraphrases=["帮我review代码", "Code review please"], + tags=["semantic", "mixed", "code_review"], + ), + BenchmarkCase( + id="semantic-mixed-geo-001", + input="做个SEO优化", + expected_skill="geo_optimizer", + expected_execution_mode="llm_generate", + expected_complexity="low", + category="semantic_router", + subcategory="mixed_lang_match", + paraphrases=["GEO优化一下", "Optimize for AI search"], + tags=["semantic", "mixed", "geo"], + ), + BenchmarkCase( + id="semantic-mixed-monitor-001", + input="monitor一下系统状态", + expected_skill="monitor", + expected_execution_mode="tool_call", + expected_complexity="medium", + category="semantic_router", + subcategory="mixed_lang_match", + paraphrases=["监控系统运行", "Monitor system status"], + tags=["semantic", "mixed", "monitor"], + ), ] diff --git a/tests/e2e/capability_metrics.py b/tests/e2e/capability_metrics.py index d908926..1b1f836 100644 --- a/tests/e2e/capability_metrics.py +++ b/tests/e2e/capability_metrics.py @@ -74,6 +74,24 @@ class CapabilityObservation(BaseModel): alignment_violations: int = 0 # Number of constraint violations detected cascade_alert: bool = False # Whether a cascade alert was triggered + # L3 Output Quality fields + output_quality_score: float | None = None # 1-5 LLM-as-Judge score + output_quality_reasoning: str | None = None # Judge's reasoning + + +class OutputQualityObservation(BaseModel): + """L3 output quality evaluation result.""" + + model_config = ConfigDict() + + benchmark_id: str + input_query: str + expected_skill: str | None = None + actual_skill: str | None = None + quality_score: float = 0.0 # 1-5 + reasoning: str = "" + evaluated: bool = False + class CategoryMetrics(BaseModel): """Aggregate metrics for a specific category/subcategory.""" @@ -178,6 +196,7 @@ class CapabilityReport(BaseModel): root_causes: list[RootCause] improvement_plans: list[ImprovementPlan] raw_observations: list[CapabilityObservation] + output_quality_evaluations: list[OutputQualityObservation] = [] # ═══════════════════════════════════════════════════════════════════════════ @@ -295,6 +314,93 @@ class MetricsCollector: """Get paraphrase observations only.""" return [o for o in self._observations if o.is_paraphrase] + def evaluate_output_quality( + self, llm_gateway: Any + ) -> list[OutputQualityObservation]: + """L3 Output Quality Evaluation using LLM-as-Judge. + + Evaluates only keyword_match and semantic_match categories. + Returns list of OutputQualityObservation with quality scores. + """ + results: list[OutputQualityObservation] = [] + eval_categories = {"routing", "semantic_router"} + + for obs in self._observations: + if obs.category not in eval_categories: + continue + if obs.actual_skill is None: + continue + if not obs.task_succeeded: + continue + + prompt = ( + f"评估以下Agent路由-执行结果的质量(1-5分)。\n\n" + f"用户输入: {obs.input_query}\n" + f"期望技能: {obs.expected_skill}\n" + f"实际路由技能: {obs.actual_skill}\n" + f"执行模式: {obs.actual_execution_mode}\n\n" + f"评分标准:\n" + f"1分: 完全错误的路由,输出与用户意图无关\n" + f"2分: 路由有偏差,输出部分相关但缺少关键内容\n" + f"3分: 路由基本正确,输出相关但不完整\n" + f"4分: 路由正确,输出完整且相关\n" + f"5分: 路由精准,输出完全匹配用户意图且质量优秀\n\n" + f"请只输出JSON: {{\"score\": <1-5>, \"reasoning\": \"<一句话理由>\"}}" + ) + + try: + import asyncio + + response = asyncio.run( + llm_gateway.chat( + messages=[{"role": "user", "content": prompt}], + model="default", + temperature=0.0, + max_tokens=200, + ) + ) + content = response.get("content", "") if isinstance(response, dict) else str(response) + + # Parse JSON from response + import re + + json_match = re.search(r'\{[^}]+\}', content) + if json_match: + import json as _json + + parsed = _json.loads(json_match.group()) + score = float(parsed.get("score", 0)) + reasoning = parsed.get("reasoning", "") + else: + score = 0.0 + reasoning = f"Parse failed: {content[:100]}" + + results.append( + OutputQualityObservation( + benchmark_id=obs.benchmark_id, + input_query=obs.input_query, + expected_skill=obs.expected_skill, + actual_skill=obs.actual_skill, + quality_score=max(1.0, min(5.0, score)), + reasoning=reasoning, + evaluated=True, + ) + ) + except Exception as e: + results.append( + OutputQualityObservation( + benchmark_id=obs.benchmark_id, + input_query=obs.input_query, + expected_skill=obs.expected_skill, + actual_skill=obs.actual_skill, + quality_score=0.0, + reasoning=f"Evaluation error: {e}", + evaluated=False, + ) + ) + + return results + # ═══════════════════════════════════════════════════════════════════════════ # 3. Metrics Analyzer @@ -1348,6 +1454,42 @@ class MetricsReporter: lines.append(f" └{'─' * 60}") lines.append("") + # L3 Output Quality Evaluation + if report.output_quality_evaluations: + lines.append("── L3 输出质量评估 ──────────────────────────────────────────") + evaluated = [e for e in report.output_quality_evaluations if e.evaluated] + if evaluated: + avg_score = sum(e.quality_score for e in evaluated) / len(evaluated) + lines.append(f" 评估样本数: {len(evaluated)}") + lines.append(f" 平均质量评分: {avg_score:.2f}/5.0") + score_dist = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0} + for e in evaluated: + bucket = max(1, min(5, int(e.quality_score))) + score_dist[bucket] += 1 + lines.append(f" 评分分布: 1分:{score_dist[1]} 2分:{score_dist[2]} 3分:{score_dist[3]} 4分:{score_dist[4]} 5分:{score_dist[5]}") + # Show some examples + lines.append("") + lines.append(" 样例:") + for e in evaluated[:5]: + lines.append(f" [{e.benchmark_id}] 评分={e.quality_score:.0f} 期望={e.expected_skill} 实际={e.actual_skill}") + if e.reasoning: + lines.append(f" 理由: {e.reasoning}") + else: + lines.append(" 无有效评估结果") + lines.append("") + + # L5 Adaptive Capability (reuse overfitting consistency data) + if report.overfitting_results: + lines.append("── L5 自适应能力 ──────────────────────────────────────────") + consistency_rates = [r.consistency_rate for r in report.overfitting_results] + if consistency_rates: + avg_consistency = sum(consistency_rates) / len(consistency_rates) + lines.append(f" 测试组数: {len(consistency_rates)}") + lines.append(f" 平均自适应率: {avg_consistency:.2%}") + high_adapt = sum(1 for r in consistency_rates if r >= 0.8) + lines.append(f" 高自适应(>=80%): {high_adapt}/{len(consistency_rates)}") + lines.append("") + lines.append("=" * 72) return "\n".join(lines) diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py index e01a6dc..eae0445 100644 --- a/tests/e2e/conftest.py +++ b/tests/e2e/conftest.py @@ -48,6 +48,20 @@ def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None: analyzer = MetricsAnalyzer() report = analyzer.generate_report(collector) + # L3 Output Quality Evaluation (optional, requires LLM) + try: + from tests.e2e.test_capability_router_direct import _get_components + + router, skill_registry, intent_router = _get_components() + llm_gateway = getattr(router, "_llm_gateway", None) + if llm_gateway is not None: + quality_evals = collector.evaluate_output_quality(llm_gateway) + report = analyzer.generate_report(collector) + # Attach quality evaluations to report + report.output_quality_evaluations = quality_evals + except Exception as e: + print(f"Warning: L3 output quality evaluation skipped: {e}") + output_dir = os.path.join(os.path.dirname(__file__), "..", "..", "test-results", "e2e") paths = MetricsReporter.save_report(report, output_dir) diff --git a/tests/e2e/test_capability_router_direct.py b/tests/e2e/test_capability_router_direct.py index a8090b9..0536d00 100644 --- a/tests/e2e/test_capability_router_direct.py +++ b/tests/e2e/test_capability_router_direct.py @@ -87,8 +87,12 @@ def _build_real_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRout if not pconf.api_key: pconf.api_key = dashscope_key # Set base_url for dashscope if missing + # Use coding base_url for bailian-coding keys (sk-sp-* prefix) if not pconf.base_url: - pconf.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1" + if dashscope_key.startswith("sk-sp-"): + pconf.base_url = "https://coding.dashscope.aliyuncs.com/v1" + else: + pconf.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1" break if not server_config.has_llm_provider(): @@ -105,6 +109,64 @@ def _build_real_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRout # Build real CostAwareRouter router_conf = server_config.router or {} + + # Build SemanticRouter if enabled or if embedding is available + semantic_router = None + semantic_conf = router_conf.get("semantic", {}) + if semantic_conf.get("enabled", False): + try: + from agentkit.chat.semantic_router import SemanticRouter + from agentkit.memory.embedder import OpenAIEmbedder + + # Try to get embedder from LLM gateway cache first + embedder = getattr(llm_gateway, "_embedder", None) + + # If no cache embedder, create one directly from provider config + if embedder is None: + # Find a provider with an API key to use for embedding + for pname, pconf in server_config.llm_config.providers.items(): + if pconf.api_key: + # Use correct base_url based on key prefix + if pconf.api_key.startswith("sk-sp-"): + base_url = pconf.base_url or "https://coding.dashscope.aliyuncs.com/v1" + else: + base_url = pconf.base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1" + embedder = OpenAIEmbedder( + api_key=pconf.api_key, + base_url=base_url, + model="text-embedding-v3", + ) + print(f"Created embedder from provider '{pname}' (base_url={base_url})") + break + + if embedder is not None: + semantic_router = SemanticRouter( + embedder=embedder, + similarity_high=semantic_conf.get("similarity_high", 0.85), + similarity_low=semantic_conf.get("similarity_low", 0.4), + ) + # Build skill embedding index + import asyncio + + try: + loop = asyncio.get_running_loop() + except RuntimeError: + loop = None + + if loop and loop.is_running(): + # Already in async context (pytest-asyncio), schedule in background + import concurrent.futures + + with concurrent.futures.ThreadPoolExecutor() as pool: + pool.submit(asyncio.run, semantic_router.build_index(skill_registry)).result() + else: + asyncio.run(semantic_router.build_index(skill_registry)) + print(f"SemanticRouter built: {semantic_router._index.size} skills indexed") + else: + print("Warning: No embedder available for SemanticRouter") + except Exception as e: + print(f"Warning: SemanticRouter not available: {e}") + router = CostAwareRouter( llm_gateway=llm_gateway, model="default", @@ -112,6 +174,7 @@ def _build_real_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRout auction_enabled=router_conf.get("auction_enabled", False), classifier=router_conf.get("classifier", "heuristic"), merged_llm_classify=router_conf.get("merged_llm_classify", True), + semantic_router=semantic_router, ) return router, skill_registry, intent_router