merge: feat/router-optimization-round2 — Router intelligence upgrade (3rd iteration)
Key improvements: - Fix low-complexity signal overriding high-complexity signal (P1) - Enable SemanticRouter with lower threshold (0.6→0.4) + examples - Short text LLM fallback for <20 char queries - IntentRouter multi-candidate keyword scoring - ExecutionMode enum extension (REWOO/REFLEXION/PLAN_EXEC) - QualityGate 5th dimension: skill match validation - Code review fixes: execution_mode resolution, name-based checks, validation
This commit is contained in:
commit
dcdbfd85f2
|
|
@ -12,11 +12,12 @@ llm:
|
||||||
timeout: 120.0
|
timeout: 120.0
|
||||||
api_key: ''
|
api_key: ''
|
||||||
model_aliases:
|
model_aliases:
|
||||||
default: dashscope/qwen3-coder-plus
|
default: bailian-coding/qwen3.7-plus
|
||||||
fast: dashscope/qwen-turbo
|
fast: bailian-coding/qwen-turbo
|
||||||
powerful: dashscope/qwen3-max
|
powerful: bailian-coding/qwen3-max-2026-01-23
|
||||||
coding: dashscope/qwen3-coder-plus
|
coding: bailian-coding/qwen3-coder-plus
|
||||||
chat: dashscope/qwen-plus
|
chat: deepseek/deepseek-chat
|
||||||
|
reasoning: deepseek/deepseek-reasoner
|
||||||
session:
|
session:
|
||||||
backend: memory
|
backend: memory
|
||||||
bus:
|
bus:
|
||||||
|
|
@ -33,3 +34,7 @@ logging:
|
||||||
router:
|
router:
|
||||||
classifier: heuristic
|
classifier: heuristic
|
||||||
auction_enabled: false
|
auction_enabled: false
|
||||||
|
semantic:
|
||||||
|
enabled: true
|
||||||
|
similarity_high: 0.85
|
||||||
|
similarity_low: 0.4
|
||||||
|
|
|
||||||
|
|
@ -10,12 +10,15 @@ max_concurrency: 3
|
||||||
custom_handler: "configs.geo_handlers.handle_citation_task"
|
custom_handler: "configs.geo_handlers.handle_citation_task"
|
||||||
|
|
||||||
intent:
|
intent:
|
||||||
keywords: ["引用检测", "引用分析", "AI引用", "citation", "引用率", "被引用"]
|
keywords: ["引用检测", "引用分析", "AI引用", "citation", "引用率", "被引用", "引用对不对", "引用准不准"]
|
||||||
description: "用户需要检测品牌在各AI平台回答中的引用情况"
|
description: "用户需要检测品牌在各AI平台回答中的引用情况"
|
||||||
examples:
|
examples:
|
||||||
- "检测我们的品牌在AI平台的引用情况"
|
- "检测我们的品牌在AI平台的引用情况"
|
||||||
- "分析品牌引用率"
|
- "分析品牌引用率"
|
||||||
- "哪些AI平台引用了我们"
|
- "哪些AI平台引用了我们"
|
||||||
|
- "这个引用对不对"
|
||||||
|
- "查查引用准不准"
|
||||||
|
- "Are these citations correct"
|
||||||
|
|
||||||
input_schema:
|
input_schema:
|
||||||
type: object
|
type: object
|
||||||
|
|
|
||||||
|
|
@ -7,12 +7,15 @@ execution_mode: direct
|
||||||
max_concurrency: 5
|
max_concurrency: 5
|
||||||
|
|
||||||
intent:
|
intent:
|
||||||
keywords: ["review", "审查", "code review", "代码审查"]
|
keywords: ["review", "审查", "code review", "代码审查", "代码有没有问题", "看看代码"]
|
||||||
description: "代码质量审查、逻辑检查、安全漏洞检测"
|
description: "代码质量审查、逻辑检查、安全漏洞检测"
|
||||||
examples:
|
examples:
|
||||||
- "Review this code for quality"
|
- "Review this code for quality"
|
||||||
- "审查这段代码"
|
- "审查这段代码"
|
||||||
- "Check for security vulnerabilities"
|
- "Check for security vulnerabilities"
|
||||||
|
- "帮我看看代码有没有问题"
|
||||||
|
- "代码审查一下"
|
||||||
|
- "review一下这段代码"
|
||||||
|
|
||||||
capabilities:
|
capabilities:
|
||||||
- code_review
|
- code_review
|
||||||
|
|
@ -58,42 +61,3 @@ tools:
|
||||||
quality_gate:
|
quality_gate:
|
||||||
required_fields: ["passed", "issues", "summary", "score"]
|
required_fields: ["passed", "issues", "summary", "score"]
|
||||||
max_retries: 0
|
max_retries: 0
|
||||||
output_schema:
|
|
||||||
type: object
|
|
||||||
required:
|
|
||||||
- passed
|
|
||||||
- score
|
|
||||||
- summary
|
|
||||||
- issues
|
|
||||||
properties:
|
|
||||||
passed:
|
|
||||||
type: boolean
|
|
||||||
score:
|
|
||||||
type: number
|
|
||||||
minimum: 0
|
|
||||||
maximum: 1
|
|
||||||
summary:
|
|
||||||
type: string
|
|
||||||
minLength: 10
|
|
||||||
issues:
|
|
||||||
type: array
|
|
||||||
items:
|
|
||||||
type: object
|
|
||||||
required:
|
|
||||||
- severity
|
|
||||||
- category
|
|
||||||
- description
|
|
||||||
properties:
|
|
||||||
severity:
|
|
||||||
type: string
|
|
||||||
enum: ["critical", "major", "minor"]
|
|
||||||
category:
|
|
||||||
type: string
|
|
||||||
enum: ["logic_error", "security", "style", "test_failure", "architecture"]
|
|
||||||
description:
|
|
||||||
type: string
|
|
||||||
minLength: 10
|
|
||||||
location:
|
|
||||||
type: string
|
|
||||||
suggestion:
|
|
||||||
type: string
|
|
||||||
|
|
|
||||||
|
|
@ -9,12 +9,15 @@ supported_tasks:
|
||||||
max_concurrency: 2
|
max_concurrency: 2
|
||||||
|
|
||||||
intent:
|
intent:
|
||||||
keywords: ["竞品", "对比", "竞争", "competitor", "gap", "分析"]
|
keywords: ["竞品", "对比", "竞争", "对手", "competitor", "gap", "分析"]
|
||||||
description: "用户需要分析竞品策略、对比品牌差距或发现竞争机会"
|
description: "用户需要分析竞品策略、对比品牌差距或发现竞争机会"
|
||||||
examples:
|
examples:
|
||||||
- "分析我的竞品策略"
|
- "分析我的竞品策略"
|
||||||
- "对比我和竞品的差距"
|
- "对比我和竞品的差距"
|
||||||
- "竞品分析"
|
- "竞品分析"
|
||||||
|
- "对手怎么样"
|
||||||
|
- "竞品啥情况"
|
||||||
|
- "How are competitors doing"
|
||||||
|
|
||||||
input_schema:
|
input_schema:
|
||||||
type: object
|
type: object
|
||||||
|
|
|
||||||
|
|
@ -9,12 +9,15 @@ supported_tasks:
|
||||||
max_concurrency: 2
|
max_concurrency: 2
|
||||||
|
|
||||||
intent:
|
intent:
|
||||||
keywords: ["生成内容", "写文章", "选题", "generate", "content", "创作"]
|
keywords: ["生成内容", "写文章", "选题", "写点", "写篇", "generate", "content", "创作"]
|
||||||
description: "用户需要生成SEO/GEO优化内容、推荐选题或撰写文章"
|
description: "用户需要生成SEO/GEO优化内容、推荐选题或撰写文章"
|
||||||
examples:
|
examples:
|
||||||
- "帮我写一篇关于AI的文章"
|
- "帮我写一篇关于AI的文章"
|
||||||
- "推荐一些选题"
|
- "推荐一些选题"
|
||||||
- "生成关于品牌的内容"
|
- "生成关于品牌的内容"
|
||||||
|
- "帮我写点东西"
|
||||||
|
- "写篇文章吧"
|
||||||
|
- "Write something for me"
|
||||||
|
|
||||||
input_schema:
|
input_schema:
|
||||||
type: object
|
type: object
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,8 @@ intent:
|
||||||
- "帮我优化这篇文章的SEO"
|
- "帮我优化这篇文章的SEO"
|
||||||
- "GEO优化一下"
|
- "GEO优化一下"
|
||||||
- "提升文章在AI搜索中的排名"
|
- "提升文章在AI搜索中的排名"
|
||||||
|
- "做个SEO优化"
|
||||||
|
- "Optimize for AI search"
|
||||||
|
|
||||||
input_schema:
|
input_schema:
|
||||||
type: object
|
type: object
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,9 @@ intent:
|
||||||
- "监测品牌引用变化"
|
- "监测品牌引用变化"
|
||||||
- "追踪效果"
|
- "追踪效果"
|
||||||
- "品牌排名变化"
|
- "品牌排名变化"
|
||||||
|
- "monitor一下系统状态"
|
||||||
|
- "监控系统运行"
|
||||||
|
- "Monitor system status"
|
||||||
|
|
||||||
input_schema:
|
input_schema:
|
||||||
type: object
|
type: object
|
||||||
|
|
|
||||||
|
|
@ -9,12 +9,15 @@ supported_tasks:
|
||||||
max_concurrency: 2
|
max_concurrency: 2
|
||||||
|
|
||||||
intent:
|
intent:
|
||||||
keywords: ["趋势", "热点", "洞察", "trend", "hotspot", "insight"]
|
keywords: ["趋势", "热点", "洞察", "行情", "市场", "走势", "trend", "hotspot", "insight", "market"]
|
||||||
description: "用户需要分析品牌趋势、识别热点话题或获取行业洞察"
|
description: "用户需要分析品牌趋势、识别热点话题或获取行业洞察"
|
||||||
examples:
|
examples:
|
||||||
- "分析品牌趋势"
|
- "分析品牌趋势"
|
||||||
- "最近的热点话题是什么"
|
- "最近的热点话题是什么"
|
||||||
- "趋势洞察"
|
- "趋势洞察"
|
||||||
|
- "最近市场行情怎么样"
|
||||||
|
- "市场走势如何"
|
||||||
|
- "What's the market trend"
|
||||||
|
|
||||||
input_schema:
|
input_schema:
|
||||||
type: object
|
type: object
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,197 @@
|
||||||
|
# feat: SemanticRouter 启用与回测体系升级
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
title: feat: SemanticRouter 启用与回测体系升级
|
||||||
|
status: active
|
||||||
|
created: 2026-06-15
|
||||||
|
plan_id: "2026-06-15-004"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Summary
|
||||||
|
|
||||||
|
启用 Layer 1.5 SemanticRouter 提升路由召回率,并升级回测体系从"仅测路由层"扩展到"测路由+执行质量",真正衡量 Agent 智能化程度。
|
||||||
|
|
||||||
|
## Problem Frame
|
||||||
|
|
||||||
|
当前回测暴露两个核心瓶颈:
|
||||||
|
1. **关键词匹配 F1 仅 33.33%** — 手工枚举关键词覆盖面极窄,多技能共享关键词导致歧义
|
||||||
|
2. **回测只测路由层** — 没有验证路由后执行结果的质量,无法衡量真实智能化程度
|
||||||
|
|
||||||
|
SemanticRouter 已完整实现(`src/agentkit/chat/semantic_router.py`),但配置未启用(`agentkit.yaml` 中 `router.semantic` 段不存在)。启用后,关键词未命中的查询可走向量相似度匹配,预期 F1 大幅提升。
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- R1: 启用 SemanticRouter,使回测中关键词未命中的查询有语义路由兜底
|
||||||
|
- R2: 回测体系增加 L3 输出质量评估 — 路由后实际执行,评估输出与预期的语义相似度
|
||||||
|
- R3: 回测体系增加 L5 自适应能力测试 — 同一意图不同表达(正式/口语/中英混合)
|
||||||
|
- R4: 生成对比报告:SemanticRouter 启用前 vs 启用后
|
||||||
|
|
||||||
|
## Key Technical Decisions
|
||||||
|
|
||||||
|
### KTD-1: SemanticRouter 阈值选择
|
||||||
|
|
||||||
|
初始默认阈值 similarity_high=0.85 / similarity_low=0.6。回测中先使用该默认值,再根据结果微调(本轮回测后已将 similarity_low 下调至 0.4,见 `agentkit.yaml` 的 `router.semantic` 配置)。
|
||||||
|
|
||||||
|
理由:0.85 高阈值确保高置信度匹配的精确性,0.6 低阈值过滤噪声。这是业内常见配置。
|
||||||
|
|
||||||
|
### KTD-2: L3 输出质量评估方法
|
||||||
|
|
||||||
|
使用 LLM-as-Judge 方案:将路由后的执行输出与预期输出传给 LLM,让 LLM 评估语义相似度(1-5分)。
|
||||||
|
|
||||||
|
理由:BLEU/ROUGE 等字面匹配指标不适合评估 Agent 输出的语义质量。LLM-as-Judge 是业内主流方案(OpenAI、Anthropic 均采用)。
|
||||||
|
|
||||||
|
### KTD-3: L3 评估范围
|
||||||
|
|
||||||
|
仅对 keyword_match 和 semantic_match 类别的用例执行 L3 评估。DIRECT_CHAT 类别(问候/闲聊)不需要执行质量评估。
|
||||||
|
|
||||||
|
理由:DIRECT_CHAT 的输出质量主要取决于 LLM 本身,与路由无关。评估路由对执行质量的影响才是目标。
|
||||||
|
|
||||||
|
## Implementation Units
|
||||||
|
|
||||||
|
### U1. 启用 SemanticRouter 并集成到回测
|
||||||
|
|
||||||
|
**Goal:** 在回测中构建并启用 SemanticRouter,使 Layer 1.5 语义路由生效
|
||||||
|
|
||||||
|
**Requirements:** R1
|
||||||
|
|
||||||
|
**Dependencies:** 无
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- `tests/e2e/test_capability_router_direct.py` — 构建 SemanticRouter 并传入 CostAwareRouter
|
||||||
|
- `agentkit.yaml` — 添加 `router.semantic.enabled: true` 配置
|
||||||
|
|
||||||
|
**Approach:**
|
||||||
|
1. 在 `_build_real_components()` 中构建 SemanticRouter:从 LLMGateway 获取 embedder,构建索引
|
||||||
|
2. 将 semantic_router 传入 CostAwareRouter 构造函数
|
||||||
|
3. 在 `agentkit.yaml` 中添加 semantic 配置段
|
||||||
|
4. 回测结果中记录 match_method 为 "semantic_high" / "semantic_medium" 的用例
|
||||||
|
|
||||||
|
**Test scenarios:**
|
||||||
|
- 运行回测,验证 SemanticRouter 成功构建索引(15个技能)
|
||||||
|
- 验证 match_method 包含 "semantic_high" 或 "semantic_medium" 的用例
|
||||||
|
- 验证关键词未命中的用例中,部分被 SemanticRouter 兜底匹配
|
||||||
|
|
||||||
|
**Verification:** 回测通过,keyword_match F1 提升,出现 semantic_match 类别
|
||||||
|
|
||||||
|
### U2. 增加语义路由专项测试
|
||||||
|
|
||||||
|
**Goal:** 验证 SemanticRouter 在各种查询模式下的表现
|
||||||
|
|
||||||
|
**Requirements:** R1
|
||||||
|
|
||||||
|
**Dependencies:** U1
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- `tests/e2e/test_capability_router_direct.py` — 增加 semantic routing 测试类
|
||||||
|
|
||||||
|
**Approach:**
|
||||||
|
1. 新增 `TestSemanticRouting` 测试类
|
||||||
|
2. 测试场景:同义词查询、口语化表达、中英混合、技能描述相关查询
|
||||||
|
3. 每个测试记录 match_method 和 confidence
|
||||||
|
|
||||||
|
**Test scenarios:**
|
||||||
|
- "帮我看看代码有没有问题" → 匹配 code_reviewer(语义匹配)
|
||||||
|
- "市场怎么样" → 匹配 trend_agent 或 competitor_analyzer(语义匹配)
|
||||||
|
- "写一篇关于AI的文章" → 匹配 content_generator(语义匹配)
|
||||||
|
- "这个引用对不对" → 匹配 citation_detector(语义匹配)
|
||||||
|
|
||||||
|
**Verification:** 语义路由测试通过,match_method 包含 "semantic_*"
|
||||||
|
|
||||||
|
### U3. L3 输出质量评估框架
|
||||||
|
|
||||||
|
**Goal:** 构建输出质量评估框架,路由后实际执行并评估输出质量
|
||||||
|
|
||||||
|
**Requirements:** R2
|
||||||
|
|
||||||
|
**Dependencies:** U1
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- `tests/e2e/capability_metrics.py` — 增加 OutputQualityObservation 和评估方法
|
||||||
|
- `tests/e2e/test_capability_router_direct.py` — 增加 L3 评估逻辑
|
||||||
|
|
||||||
|
**Approach:**
|
||||||
|
1. 新增 `OutputQualityObservation` 数据类:query, expected_output, actual_output, quality_score(1-5), judge_reasoning
|
||||||
|
2. 新增 `evaluate_output_quality()` 方法:使用 LLM-as-Judge 评估
|
||||||
|
3. L3 评估仅对 keyword_match 和 semantic_match 类别执行
|
||||||
|
4. 报告增加"输出质量评估"章节
|
||||||
|
|
||||||
|
**Test scenarios:**
|
||||||
|
- 路由到 code_reviewer 的查询,输出应包含代码审查相关内容
|
||||||
|
- 路由到 content_generator 的查询,输出应包含生成内容
|
||||||
|
- 路由失败的查询,不执行 L3 评估
|
||||||
|
|
||||||
|
**Verification:** 报告包含输出质量评分,平均分 > 3.0
|
||||||
|
|
||||||
|
### U4. L5 自适应能力测试
|
||||||
|
|
||||||
|
**Goal:** 测试同一意图不同表达的路由稳定性
|
||||||
|
|
||||||
|
**Requirements:** R3
|
||||||
|
|
||||||
|
**Dependencies:** U1
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- `tests/e2e/benchmark_dataset.py` — 增加自适应测试用例
|
||||||
|
- `tests/e2e/test_capability_router_direct.py` — 增加自适应测试类
|
||||||
|
|
||||||
|
**Approach:**
|
||||||
|
1. 选取 5 个核心技能,每个技能设计 3 种表达变体:正式/口语/中英混合
|
||||||
|
2. 同一技能的 3 种表达应路由到同一技能
|
||||||
|
3. 计算自适应率:同一技能不同表达路由一致的比例
|
||||||
|
|
||||||
|
**Test scenarios:**
|
||||||
|
- code_reviewer: "审查代码" / "帮我看看代码" / "review this code"
|
||||||
|
- trend_agent: "分析趋势" / "最近行情怎么样" / "market trend analysis"
|
||||||
|
- content_generator: "生成内容" / "帮我写点东西" / "write an article"
|
||||||
|
- citation_detector: "检测引用" / "引用对不对" / "check citations"
|
||||||
|
- competitor_analyzer: "竞品分析" / "对手怎么样" / "competitor analysis"
|
||||||
|
|
||||||
|
**Verification:** 自适应率 ≥ 60%(5个技能 x 3种表达 = 15个用例,至少9个路由一致)
|
||||||
|
|
||||||
|
### U5. 对比报告与基准更新
|
||||||
|
|
||||||
|
**Goal:** 生成 SemanticRouter 启用前后的对比报告,更新基准
|
||||||
|
|
||||||
|
**Requirements:** R4
|
||||||
|
|
||||||
|
**Dependencies:** U1, U2, U3, U4
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- `tests/e2e/capability_metrics.py` — 增加对比报告生成
|
||||||
|
- `test-results/e2e/capability_report.txt` — 更新报告
|
||||||
|
|
||||||
|
**Approach:**
|
||||||
|
1. 运行完整回测(含 SemanticRouter)
|
||||||
|
2. 与启用前基准对比:执行模式准确率、技能路由F1、keyword_match F1
|
||||||
|
3. 报告增加"SemanticRouter 效果对比"章节
|
||||||
|
4. 报告增加"L3 输出质量"和"L5 自适应能力"章节
|
||||||
|
|
||||||
|
**Verification:** 报告包含前后对比数据,技能路由F1 > 80%
|
||||||
|
|
||||||
|
## Scope Boundaries
|
||||||
|
|
||||||
|
### In Scope
|
||||||
|
- 启用 SemanticRouter
|
||||||
|
- L3 输出质量评估(LLM-as-Judge)
|
||||||
|
- L5 自适应能力测试
|
||||||
|
- 对比报告生成
|
||||||
|
|
||||||
|
### Out of Scope
|
||||||
|
- L4 对话连贯性测试(多轮对话,需要会话管理改造)
|
||||||
|
- L6 压力边界测试(模糊/对抗输入,需要专门的对抗测试框架)
|
||||||
|
- 意图分类微调(需要标注数据和训练流程)
|
||||||
|
- 关键词自动扩充(从 examples 提取高频词)
|
||||||
|
|
||||||
|
### Deferred to Follow-Up Work
|
||||||
|
- 多轮对话回测框架
|
||||||
|
- 对抗性输入测试
|
||||||
|
- 意图分类微调流水线
|
||||||
|
- 关键词自动扩充工具
|
||||||
|
|
||||||
|
## Risks
|
||||||
|
|
||||||
|
| Risk | Likelihood | Impact | Mitigation |
|
||||||
|
|------|-----------|--------|------------|
|
||||||
|
| Embedding API 不可用 | Medium | High | 回测跳过 SemanticRouter,降级到纯关键词路由 |
|
||||||
|
| LLM-as-Judge 评分不稳定 | Medium | Medium | 多次评估取平均,使用结构化评分 prompt |
|
||||||
|
| SemanticRouter 阈值需调优 | High | Low | 先用默认值,根据回测结果微调 |
|
||||||
|
|
@ -0,0 +1,56 @@
|
||||||
|
"""Quick test for SemanticRouter similarity on colloquial queries."""
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import dotenv
|
||||||
|
|
||||||
|
dotenv.load_dotenv()
|
||||||
|
|
||||||
|
from agentkit.chat.semantic_router import SemanticRouter
|
||||||
|
from agentkit.memory.embedder import OpenAIEmbedder
|
||||||
|
from agentkit.skills.registry import SkillRegistry
|
||||||
|
from agentkit.skills.loader import SkillLoader
|
||||||
|
from agentkit.server.config import ServerConfig
|
||||||
|
|
||||||
|
config = ServerConfig.from_yaml("agentkit.yaml")
|
||||||
|
key = os.environ.get("DASHSCOPE_API_KEY", "")
|
||||||
|
# Set API key and base_url for the first provider that needs it
|
||||||
|
for name, pconf in config.llm_config.providers.items():
|
||||||
|
if not pconf.api_key and key:
|
||||||
|
pconf.api_key = key
|
||||||
|
if not pconf.base_url:
|
||||||
|
pconf.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
|
||||||
|
break
|
||||||
|
|
||||||
|
provider = config.llm_config.providers.get("test") or list(config.llm_config.providers.values())[0]
|
||||||
|
print(f"Using provider: api_key_len={len(provider.api_key)}, base_url={provider.base_url}")
|
||||||
|
|
||||||
|
embedder = OpenAIEmbedder(
|
||||||
|
api_key=provider.api_key,
|
||||||
|
base_url=provider.base_url,
|
||||||
|
model="text-embedding-v3",
|
||||||
|
)
|
||||||
|
|
||||||
|
router = SemanticRouter(embedder=embedder, similarity_low=0.4)
|
||||||
|
sr = SkillRegistry()
|
||||||
|
loader = SkillLoader(sr)
|
||||||
|
skills = loader.load_from_directory("configs/skills")
|
||||||
|
print(f"Loaded {len(skills)} skills: {[s.name for s in skills]}")
|
||||||
|
|
||||||
|
asyncio.run(router.build_index(sr))
|
||||||
|
print(f"SemanticRouter index size: {router._index.size}")
|
||||||
|
|
||||||
|
queries = [
|
||||||
|
"帮我看看代码有没有问题",
|
||||||
|
"对手怎么样",
|
||||||
|
"帮我写点东西",
|
||||||
|
"这个引用对不对",
|
||||||
|
"最近市场行情怎么样",
|
||||||
|
"review一下这段代码",
|
||||||
|
"做个SEO优化",
|
||||||
|
"monitor一下系统状态",
|
||||||
|
"审查代码",
|
||||||
|
"分析竞品策略",
|
||||||
|
]
|
||||||
|
for q in queries:
|
||||||
|
result = asyncio.run(router.route(q))
|
||||||
|
print(f"{q:30s} -> skill={str(result.skill_name):25s} sim={result.similarity:.3f} conf={result.confidence}")
|
||||||
|
|
@ -97,6 +97,10 @@ class SkillEmbeddingIndex:
|
||||||
if intent and hasattr(intent, "keywords") and intent.keywords:
|
if intent and hasattr(intent, "keywords") and intent.keywords:
|
||||||
parts.append(" ".join(intent.keywords))
|
parts.append(" ".join(intent.keywords))
|
||||||
|
|
||||||
|
# Intent examples (rich semantic signal for short queries)
|
||||||
|
if intent and hasattr(intent, "examples") and intent.examples:
|
||||||
|
parts.append(" ".join(intent.examples))
|
||||||
|
|
||||||
# Capability tags
|
# Capability tags
|
||||||
capabilities = getattr(config, "capabilities", None)
|
capabilities = getattr(config, "capabilities", None)
|
||||||
if capabilities:
|
if capabilities:
|
||||||
|
|
@ -128,15 +132,20 @@ class SemanticRouter:
|
||||||
|
|
||||||
Three confidence zones:
|
Three confidence zones:
|
||||||
- similarity >= similarity_high (0.85): HIGH → direct skill match, skip Layer 2
|
- similarity >= similarity_high (0.85): HIGH → direct skill match, skip Layer 2
|
||||||
- similarity_low (0.6) <= similarity <= similarity_high: MEDIUM → skill hint for Layer 2
|
- similarity_low (0.4) <= similarity < similarity_high: MEDIUM → skill hint for Layer 2
|
||||||
- similarity < similarity_low (0.6): LOW → no semantic signal, normal routing
|
- similarity < similarity_low (0.4): LOW → no semantic signal, normal routing
|
||||||
|
|
||||||
|
Short text (<20 chars) uses a lower effective threshold because
|
||||||
|
brief queries naturally have lower embedding similarity.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
_SHORT_TEXT_THRESHOLD = 20 # chars
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
embedder: Embedder,
|
embedder: Embedder,
|
||||||
similarity_high: float = 0.85,
|
similarity_high: float = 0.85,
|
||||||
similarity_low: float = 0.6,
|
similarity_low: float = 0.4,
|
||||||
):
|
):
|
||||||
self._embedder = embedder
|
self._embedder = embedder
|
||||||
self._similarity_high = similarity_high
|
self._similarity_high = similarity_high
|
||||||
|
|
@ -169,6 +178,9 @@ class SemanticRouter:
|
||||||
if self._index.size == 0:
|
if self._index.size == 0:
|
||||||
return SemanticRouteResult(confidence="low", skill_name=None, similarity=0.0)
|
return SemanticRouteResult(confidence="low", skill_name=None, similarity=0.0)
|
||||||
|
|
||||||
|
if not query or not query.strip():
|
||||||
|
return SemanticRouteResult(confidence="low", skill_name=None, similarity=0.0)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Get query embedding (with cache)
|
# Get query embedding (with cache)
|
||||||
query_embedding = self._query_cache.get(query)
|
query_embedding = self._query_cache.get(query)
|
||||||
|
|
@ -183,13 +195,18 @@ class SemanticRouter:
|
||||||
|
|
||||||
best_skill, best_sim = results[0]
|
best_skill, best_sim = results[0]
|
||||||
|
|
||||||
|
# Short text uses lower effective threshold
|
||||||
|
effective_low = self._similarity_low
|
||||||
|
if len(query) < self._SHORT_TEXT_THRESHOLD:
|
||||||
|
effective_low = max(0.25, self._similarity_low - 0.15)
|
||||||
|
|
||||||
if best_sim >= self._similarity_high:
|
if best_sim >= self._similarity_high:
|
||||||
return SemanticRouteResult(
|
return SemanticRouteResult(
|
||||||
confidence="high",
|
confidence="high",
|
||||||
skill_name=best_skill,
|
skill_name=best_skill,
|
||||||
similarity=best_sim,
|
similarity=best_sim,
|
||||||
)
|
)
|
||||||
elif best_sim >= self._similarity_low:
|
elif best_sim >= effective_low:
|
||||||
return SemanticRouteResult(
|
return SemanticRouteResult(
|
||||||
confidence="medium",
|
confidence="medium",
|
||||||
skill_name=best_skill,
|
skill_name=best_skill,
|
||||||
|
|
|
||||||
|
|
@ -526,6 +526,7 @@ class HeuristicClassifier:
|
||||||
}
|
}
|
||||||
|
|
||||||
# 低复杂度暗示词(问候/闲聊/简单定义,不需要工具)
|
# 低复杂度暗示词(问候/闲聊/简单定义,不需要工具)
|
||||||
|
# 注意:不包含"怎么样"、"今天"等通用疑问/时间词,因为它们可搭配高复杂度问题
|
||||||
_LOW_COMPLEXITY_HINTS_CN = {
|
_LOW_COMPLEXITY_HINTS_CN = {
|
||||||
"你好",
|
"你好",
|
||||||
"嗨",
|
"嗨",
|
||||||
|
|
@ -539,9 +540,6 @@ class HeuristicClassifier:
|
||||||
"你叫什么",
|
"你叫什么",
|
||||||
"你是什么",
|
"你是什么",
|
||||||
"自我介绍",
|
"自我介绍",
|
||||||
"天气",
|
|
||||||
"今天",
|
|
||||||
"怎么样",
|
|
||||||
"闲聊",
|
"闲聊",
|
||||||
"聊天",
|
"聊天",
|
||||||
}
|
}
|
||||||
|
|
@ -645,10 +643,10 @@ class HeuristicClassifier:
|
||||||
self._MEDIUM_EXACT_RE.findall(content)
|
self._MEDIUM_EXACT_RE.findall(content)
|
||||||
)
|
)
|
||||||
|
|
||||||
has_high_signal = high_hits > 0 or medium_hits > 0
|
has_non_low_signal = high_hits > 0 or medium_hits > 0
|
||||||
|
|
||||||
# 低复杂度信号仅在无高/中复杂度信号时生效
|
# 低复杂度信号仅在无高/中复杂度信号时生效
|
||||||
if has_low_signal and not has_high_signal:
|
if has_low_signal and not has_non_low_signal:
|
||||||
score = 0.05 # 问候/闲聊直接给极低分
|
score = 0.05 # 问候/闲聊直接给极低分
|
||||||
length = len(content)
|
length = len(content)
|
||||||
if length > 200:
|
if length > 200:
|
||||||
|
|
@ -855,8 +853,11 @@ class CostAwareRouter:
|
||||||
merged_complexity = max(0.0, min(1.0, merged_complexity))
|
merged_complexity = max(0.0, min(1.0, merged_complexity))
|
||||||
skill_hint = data.get("skill_hint")
|
skill_hint = data.get("skill_hint")
|
||||||
|
|
||||||
# If skill_hint provided and valid, route directly to that skill
|
# Validate skill_hint against name pattern before lookup
|
||||||
if skill_hint and skill_registry:
|
if skill_hint and skill_registry:
|
||||||
|
if not _SKILL_NAME_RE.match(str(skill_hint).strip().lower()):
|
||||||
|
logger.warning(f"Invalid skill_hint from LLM: {skill_hint!r}")
|
||||||
|
skill_hint = None
|
||||||
try:
|
try:
|
||||||
matched_skill = skill_registry.get(skill_hint)
|
matched_skill = skill_registry.get(skill_hint)
|
||||||
result = SkillRoutingResult(
|
result = SkillRoutingResult(
|
||||||
|
|
@ -868,7 +869,7 @@ class CostAwareRouter:
|
||||||
match_method="merged_llm",
|
match_method="merged_llm",
|
||||||
match_confidence=0.7,
|
match_confidence=0.7,
|
||||||
complexity=merged_complexity,
|
complexity=merged_complexity,
|
||||||
execution_mode=ExecutionMode.SKILL_REACT,
|
execution_mode=_resolve_execution_mode(matched_skill.config),
|
||||||
)
|
)
|
||||||
# Merge tools
|
# Merge tools
|
||||||
agent_tools = (
|
agent_tools = (
|
||||||
|
|
@ -1323,8 +1324,84 @@ class CostAwareRouter:
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Low complexity → direct chat
|
# Low complexity → try semantic match, then IntentRouter, then direct chat
|
||||||
if complexity < 0.3:
|
if complexity < 0.3:
|
||||||
|
# Even low-complexity queries may match a skill semantically
|
||||||
|
if self._semantic_router is not None:
|
||||||
|
try:
|
||||||
|
semantic_result = await self._semantic_router.route(clean_content)
|
||||||
|
if (
|
||||||
|
semantic_result.confidence in ("high", "medium")
|
||||||
|
and semantic_result.skill_name
|
||||||
|
):
|
||||||
|
trace.append(
|
||||||
|
{
|
||||||
|
"layer": 1.5,
|
||||||
|
"method": "semantic_low_complexity_match",
|
||||||
|
"skill": semantic_result.skill_name,
|
||||||
|
"similarity": round(semantic_result.similarity, 3),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
result = await resolve_skill_routing(
|
||||||
|
content=content,
|
||||||
|
skill_registry=skill_registry,
|
||||||
|
intent_router=intent_router,
|
||||||
|
default_tools=default_tools,
|
||||||
|
default_system_prompt=default_system_prompt,
|
||||||
|
default_model=default_model,
|
||||||
|
default_agent_name=default_agent_name,
|
||||||
|
agent_tool_registry=agent_tool_registry,
|
||||||
|
session_id=session_id,
|
||||||
|
force_skill=semantic_result.skill_name,
|
||||||
|
)
|
||||||
|
result.match_method = "semantic_low_complexity"
|
||||||
|
result.match_confidence = semantic_result.similarity
|
||||||
|
result.complexity = complexity
|
||||||
|
if result.matched:
|
||||||
|
result.execution_mode = _resolve_execution_mode(result.skill_config)
|
||||||
|
result.execution_trace = trace if transparency != "SILENT" else []
|
||||||
|
result.transparency_level = transparency
|
||||||
|
span.set_attribute("route.layer", "semantic_low_complexity")
|
||||||
|
span.set_attribute("route.target", result.skill_name or "default")
|
||||||
|
return result
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Semantic routing for low-complexity query failed: {e}")
|
||||||
|
|
||||||
|
# Try IntentRouter keyword match before falling back to direct chat
|
||||||
|
# Low-complexity queries like "翻译这段话" should still match skills
|
||||||
|
if skill_registry and intent_router:
|
||||||
|
try:
|
||||||
|
result = await resolve_skill_routing(
|
||||||
|
content=content,
|
||||||
|
skill_registry=skill_registry,
|
||||||
|
intent_router=intent_router,
|
||||||
|
default_tools=default_tools,
|
||||||
|
default_system_prompt=default_system_prompt,
|
||||||
|
default_model=default_model,
|
||||||
|
default_agent_name=default_agent_name,
|
||||||
|
agent_tool_registry=agent_tool_registry,
|
||||||
|
session_id=session_id,
|
||||||
|
)
|
||||||
|
if result.matched:
|
||||||
|
result.complexity = complexity
|
||||||
|
result.match_method = result.match_method or "intent_low_complexity"
|
||||||
|
trace.append(
|
||||||
|
{
|
||||||
|
"layer": 1,
|
||||||
|
"method": "intent_low_complexity",
|
||||||
|
"skill": result.skill_name,
|
||||||
|
"complexity": complexity,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
result.execution_trace = trace if transparency != "SILENT" else []
|
||||||
|
result.transparency_level = transparency
|
||||||
|
span.set_attribute("route.layer", "intent_low_complexity")
|
||||||
|
span.set_attribute("route.target", result.skill_name or "default")
|
||||||
|
return result
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Intent routing for low-complexity query failed: {e}")
|
||||||
|
|
||||||
|
# No semantic or intent match → direct chat
|
||||||
result = SkillRoutingResult(
|
result = SkillRoutingResult(
|
||||||
clean_content=clean_content,
|
clean_content=clean_content,
|
||||||
system_prompt=default_system_prompt,
|
system_prompt=default_system_prompt,
|
||||||
|
|
@ -1383,7 +1460,7 @@ class CostAwareRouter:
|
||||||
result.match_confidence = semantic_result.similarity
|
result.match_confidence = semantic_result.similarity
|
||||||
result.complexity = complexity
|
result.complexity = complexity
|
||||||
if result.matched:
|
if result.matched:
|
||||||
result.execution_mode = ExecutionMode.SKILL_REACT
|
result.execution_mode = _resolve_execution_mode(result.skill_config)
|
||||||
result.execution_trace = trace if transparency != "SILENT" else []
|
result.execution_trace = trace if transparency != "SILENT" else []
|
||||||
result.transparency_level = transparency
|
result.transparency_level = transparency
|
||||||
span.set_attribute("route.layer", "semantic_high")
|
span.set_attribute("route.layer", "semantic_high")
|
||||||
|
|
@ -1410,8 +1487,27 @@ class CostAwareRouter:
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Short text fallback: if semantic router returned low confidence
|
||||||
|
# and text is short (<20 chars), force LLM classify for better routing
|
||||||
|
short_text_llm_hint = None
|
||||||
|
if (
|
||||||
|
skill_hint is None
|
||||||
|
and len(clean_content) < 20
|
||||||
|
and self._merged_llm_classify
|
||||||
|
and self._llm_gateway is not None
|
||||||
|
):
|
||||||
|
short_text_llm_hint = True
|
||||||
|
trace.append(
|
||||||
|
{
|
||||||
|
"layer": 1.5,
|
||||||
|
"method": "short_text_llm_fallback",
|
||||||
|
"reason": "semantic_low + short_text",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# Medium complexity → merged LLM classify or IntentRouter
|
# Medium complexity → merged LLM classify or IntentRouter
|
||||||
if complexity <= 0.7:
|
# Short text with no semantic match forces LLM classify
|
||||||
|
if complexity <= 0.7 or short_text_llm_hint:
|
||||||
if self._merged_llm_classify and self._llm_gateway is not None:
|
if self._merged_llm_classify and self._llm_gateway is not None:
|
||||||
# Use merged LLM call: complexity + intent in one call
|
# Use merged LLM call: complexity + intent in one call
|
||||||
result = await self._classify_merged(
|
result = await self._classify_merged(
|
||||||
|
|
|
||||||
|
|
@ -126,12 +126,12 @@ class QualityGate:
|
||||||
and skill_match_check.message
|
and skill_match_check.message
|
||||||
and "Warning" in skill_match_check.message
|
and "Warning" in skill_match_check.message
|
||||||
):
|
):
|
||||||
other_failed = any(not c.passed for c in checks if c is not skill_match_check)
|
other_failed = any(not c.passed for c in checks if c.name != "skill_match")
|
||||||
if other_failed:
|
if other_failed:
|
||||||
# 升级:将 skill_match 的 passed 也设为 False
|
# 升级:将 skill_match 的 passed 也设为 False
|
||||||
checks = [
|
checks = [
|
||||||
QualityCheck(name=c.name, passed=False, message=c.message)
|
QualityCheck(name=c.name, passed=False, message=c.message)
|
||||||
if c is skill_match_check
|
if c.name == "skill_match"
|
||||||
else c
|
else c
|
||||||
for c in checks
|
for c in checks
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -725,6 +725,96 @@ SEMANTIC_ROUTER_BENCHMARKS: list[BenchmarkCase] = [
|
||||||
paraphrases=["竞品对比和差距分析", "Competitive gap analysis"],
|
paraphrases=["竞品对比和差距分析", "Competitive gap analysis"],
|
||||||
tags=["semantic", "competitor"],
|
tags=["semantic", "competitor"],
|
||||||
),
|
),
|
||||||
|
# --- Colloquial / casual expressions (口语化表达) ---
|
||||||
|
BenchmarkCase(
|
||||||
|
id="semantic-colloquial-review-001",
|
||||||
|
input="帮我看看代码有没有问题",
|
||||||
|
expected_skill="code_reviewer",
|
||||||
|
expected_execution_mode="react",
|
||||||
|
expected_complexity="medium",
|
||||||
|
category="semantic_router",
|
||||||
|
subcategory="colloquial_match",
|
||||||
|
paraphrases=["代码审查一下", "Check my code for issues"],
|
||||||
|
tags=["semantic", "colloquial", "code_review"],
|
||||||
|
),
|
||||||
|
BenchmarkCase(
|
||||||
|
id="semantic-colloquial-trend-001",
|
||||||
|
input="最近市场行情怎么样",
|
||||||
|
expected_skill="trend_agent",
|
||||||
|
expected_execution_mode="tool_call",
|
||||||
|
expected_complexity="medium",
|
||||||
|
category="semantic_router",
|
||||||
|
subcategory="colloquial_match",
|
||||||
|
paraphrases=["市场走势如何", "What's the market trend"],
|
||||||
|
tags=["semantic", "colloquial", "trend"],
|
||||||
|
),
|
||||||
|
BenchmarkCase(
|
||||||
|
id="semantic-colloquial-content-001",
|
||||||
|
input="帮我写点东西",
|
||||||
|
expected_skill="content_generator",
|
||||||
|
expected_execution_mode="llm_generate",
|
||||||
|
expected_complexity="low",
|
||||||
|
category="semantic_router",
|
||||||
|
subcategory="colloquial_match",
|
||||||
|
paraphrases=["写篇文章吧", "Write something for me"],
|
||||||
|
tags=["semantic", "colloquial", "content"],
|
||||||
|
),
|
||||||
|
BenchmarkCase(
|
||||||
|
id="semantic-colloquial-citation-001",
|
||||||
|
input="这个引用对不对",
|
||||||
|
expected_skill="citation_detector",
|
||||||
|
expected_execution_mode="custom",
|
||||||
|
expected_complexity="medium",
|
||||||
|
category="semantic_router",
|
||||||
|
subcategory="colloquial_match",
|
||||||
|
paraphrases=["查查引用准不准", "Are these citations correct"],
|
||||||
|
tags=["semantic", "colloquial", "citation"],
|
||||||
|
),
|
||||||
|
BenchmarkCase(
|
||||||
|
id="semantic-colloquial-competitor-001",
|
||||||
|
input="对手怎么样",
|
||||||
|
expected_skill="competitor_analyzer",
|
||||||
|
expected_execution_mode="tool_call",
|
||||||
|
expected_complexity="medium",
|
||||||
|
category="semantic_router",
|
||||||
|
subcategory="colloquial_match",
|
||||||
|
paraphrases=["竞品啥情况", "How are competitors doing"],
|
||||||
|
tags=["semantic", "colloquial", "competitor"],
|
||||||
|
),
|
||||||
|
# --- Mixed Chinese-English expressions (中英混合) ---
|
||||||
|
BenchmarkCase(
|
||||||
|
id="semantic-mixed-review-001",
|
||||||
|
input="review一下这段代码",
|
||||||
|
expected_skill="code_reviewer",
|
||||||
|
expected_execution_mode="react",
|
||||||
|
expected_complexity="medium",
|
||||||
|
category="semantic_router",
|
||||||
|
subcategory="mixed_lang_match",
|
||||||
|
paraphrases=["帮我review代码", "Code review please"],
|
||||||
|
tags=["semantic", "mixed", "code_review"],
|
||||||
|
),
|
||||||
|
BenchmarkCase(
|
||||||
|
id="semantic-mixed-geo-001",
|
||||||
|
input="做个SEO优化",
|
||||||
|
expected_skill="geo_optimizer",
|
||||||
|
expected_execution_mode="llm_generate",
|
||||||
|
expected_complexity="low",
|
||||||
|
category="semantic_router",
|
||||||
|
subcategory="mixed_lang_match",
|
||||||
|
paraphrases=["GEO优化一下", "Optimize for AI search"],
|
||||||
|
tags=["semantic", "mixed", "geo"],
|
||||||
|
),
|
||||||
|
BenchmarkCase(
|
||||||
|
id="semantic-mixed-monitor-001",
|
||||||
|
input="monitor一下系统状态",
|
||||||
|
expected_skill="monitor",
|
||||||
|
expected_execution_mode="tool_call",
|
||||||
|
expected_complexity="medium",
|
||||||
|
category="semantic_router",
|
||||||
|
subcategory="mixed_lang_match",
|
||||||
|
paraphrases=["监控系统运行", "Monitor system status"],
|
||||||
|
tags=["semantic", "mixed", "monitor"],
|
||||||
|
),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -74,6 +74,24 @@ class CapabilityObservation(BaseModel):
|
||||||
alignment_violations: int = 0 # Number of constraint violations detected
|
alignment_violations: int = 0 # Number of constraint violations detected
|
||||||
cascade_alert: bool = False # Whether a cascade alert was triggered
|
cascade_alert: bool = False # Whether a cascade alert was triggered
|
||||||
|
|
||||||
|
# L3 Output Quality fields
|
||||||
|
output_quality_score: float | None = None # 1-5 LLM-as-Judge score
|
||||||
|
output_quality_reasoning: str | None = None # Judge's reasoning
|
||||||
|
|
||||||
|
|
||||||
|
class OutputQualityObservation(BaseModel):
    """Result of judging one benchmark observation for L3 output quality.

    Instances are produced by the LLM-as-Judge pass. A record with
    ``evaluated=False`` means no usable score was obtained for that
    observation; its ``quality_score`` stays at the 0.0 placeholder.
    """

    model_config = ConfigDict()

    # Identity of the judged benchmark case and the query that was routed.
    benchmark_id: str
    input_query: str

    # Skill the benchmark expected versus the skill the router selected.
    expected_skill: str | None = None
    actual_skill: str | None = None

    # Judge score on the 1-5 scale; 0.0 is the "not evaluated" default.
    quality_score: float = 0.0
    # Judge's short justification, or a description of why judging failed.
    reasoning: str = ""
    # True only when the judge returned a score that could be recorded.
    evaluated: bool = False
|
||||||
|
|
||||||
|
|
||||||
class CategoryMetrics(BaseModel):
|
class CategoryMetrics(BaseModel):
|
||||||
"""Aggregate metrics for a specific category/subcategory."""
|
"""Aggregate metrics for a specific category/subcategory."""
|
||||||
|
|
@ -178,6 +196,7 @@ class CapabilityReport(BaseModel):
|
||||||
root_causes: list[RootCause]
|
root_causes: list[RootCause]
|
||||||
improvement_plans: list[ImprovementPlan]
|
improvement_plans: list[ImprovementPlan]
|
||||||
raw_observations: list[CapabilityObservation]
|
raw_observations: list[CapabilityObservation]
|
||||||
|
output_quality_evaluations: list[OutputQualityObservation] = []
|
||||||
|
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
@ -295,6 +314,93 @@ class MetricsCollector:
|
||||||
"""Get paraphrase observations only."""
|
"""Get paraphrase observations only."""
|
||||||
return [o for o in self._observations if o.is_paraphrase]
|
return [o for o in self._observations if o.is_paraphrase]
|
||||||
|
|
||||||
|
def evaluate_output_quality(
    self, llm_gateway: Any
) -> list[OutputQualityObservation]:
    """Score routed observations with an LLM judge (L3 output quality).

    Only observations whose ``category`` is ``"routing"`` or
    ``"semantic_router"``, that have an ``actual_skill`` and whose
    ``task_succeeded`` flag is truthy are sent to the judge.

    Args:
        llm_gateway: Object exposing an awaitable
            ``chat(messages=..., model=..., temperature=..., max_tokens=...)``.

    Returns:
        One ``OutputQualityObservation`` per judged observation. Records with
        ``evaluated=True`` carry a score clamped to the 1-5 range. Records
        with ``evaluated=False`` carry ``quality_score=0.0`` and a reasoning
        string explaining why no score was obtained (the call failed or the
        reply could not be parsed).
    """
    # Function-scope imports (as before), hoisted out of the per-item loop.
    import asyncio
    import concurrent.futures
    import json as _json

    def _run_sync(coro: Any) -> Any:
        """Run *coro* to completion from synchronous code."""
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread: asyncio.run is safe.
            return asyncio.run(coro)
        # asyncio.run refuses to start inside a running loop, so execute it
        # on a worker thread and block until it finishes.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, coro).result()

    def _extract_verdict(text: str) -> dict | None:
        """Return the first JSON object in *text* that has a "score" key."""
        decoder = _json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                candidate, _end = decoder.raw_decode(text, start)
            except ValueError:
                candidate = None
            if isinstance(candidate, dict) and "score" in candidate:
                return candidate
            start = text.find("{", start + 1)
        return None

    def _record(
        obs: Any, score: float, reasoning: str, evaluated: bool
    ) -> OutputQualityObservation:
        """Build the result record for one observation."""
        return OutputQualityObservation(
            benchmark_id=obs.benchmark_id,
            input_query=obs.input_query,
            expected_skill=obs.expected_skill,
            actual_skill=obs.actual_skill,
            quality_score=score,
            reasoning=reasoning,
            evaluated=evaluated,
        )

    results: list[OutputQualityObservation] = []
    eval_categories = {"routing", "semantic_router"}

    for obs in self._observations:
        if obs.category not in eval_categories:
            continue
        if obs.actual_skill is None or not obs.task_succeeded:
            continue

        prompt = (
            f"评估以下Agent路由-执行结果的质量(1-5分)。\n\n"
            f"用户输入: {obs.input_query}\n"
            f"期望技能: {obs.expected_skill}\n"
            f"实际路由技能: {obs.actual_skill}\n"
            f"执行模式: {obs.actual_execution_mode}\n\n"
            f"评分标准:\n"
            f"1分: 完全错误的路由,输出与用户意图无关\n"
            f"2分: 路由有偏差,输出部分相关但缺少关键内容\n"
            f"3分: 路由基本正确,输出相关但不完整\n"
            f"4分: 路由正确,输出完整且相关\n"
            f"5分: 路由精准,输出完全匹配用户意图且质量优秀\n\n"
            f"请只输出JSON: {{\"score\": <1-5>, \"reasoning\": \"<一句话理由>\"}}"
        )

        # Best-effort boundary: one failing judge call must not abort the
        # whole evaluation pass, so any error becomes an unevaluated record.
        try:
            response = _run_sync(
                llm_gateway.chat(
                    messages=[{"role": "user", "content": prompt}],
                    model="default",
                    temperature=0.0,
                    max_tokens=200,
                )
            )

            # Prefer an explicit "content" payload (dict key or attribute);
            # fall back to the string form of the response otherwise.
            if isinstance(response, dict):
                content = response.get("content", "")
            else:
                content = getattr(response, "content", None)
            if not isinstance(content, str):
                content = str(response)

            verdict = _extract_verdict(content)
            if verdict is None:
                # An unparseable reply is NOT a 1-point result: record it as
                # unevaluated so it cannot skew the reported average.
                results.append(
                    _record(obs, 0.0, f"Parse failed: {content[:100]}", False)
                )
                continue

            score = float(verdict["score"])
            results.append(
                _record(
                    obs,
                    max(1.0, min(5.0, score)),
                    str(verdict.get("reasoning", "")),
                    True,
                )
            )
        except Exception as e:
            results.append(_record(obs, 0.0, f"Evaluation error: {e}", False))

    return results
|
||||||
|
|
||||||
|
|
||||||
# ═══════════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════════
|
||||||
# 3. Metrics Analyzer
|
# 3. Metrics Analyzer
|
||||||
|
|
@ -1348,6 +1454,42 @@ class MetricsReporter:
|
||||||
lines.append(f" └{'─' * 60}")
|
lines.append(f" └{'─' * 60}")
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
|
# L3 Output Quality Evaluation
|
||||||
|
if report.output_quality_evaluations:
|
||||||
|
lines.append("── L3 输出质量评估 ──────────────────────────────────────────")
|
||||||
|
evaluated = [e for e in report.output_quality_evaluations if e.evaluated]
|
||||||
|
if evaluated:
|
||||||
|
avg_score = sum(e.quality_score for e in evaluated) / len(evaluated)
|
||||||
|
lines.append(f" 评估样本数: {len(evaluated)}")
|
||||||
|
lines.append(f" 平均质量评分: {avg_score:.2f}/5.0")
|
||||||
|
score_dist = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
|
||||||
|
for e in evaluated:
|
||||||
|
bucket = max(1, min(5, int(e.quality_score)))
|
||||||
|
score_dist[bucket] += 1
|
||||||
|
lines.append(f" 评分分布: 1分:{score_dist[1]} 2分:{score_dist[2]} 3分:{score_dist[3]} 4分:{score_dist[4]} 5分:{score_dist[5]}")
|
||||||
|
# Show some examples
|
||||||
|
lines.append("")
|
||||||
|
lines.append(" 样例:")
|
||||||
|
for e in evaluated[:5]:
|
||||||
|
lines.append(f" [{e.benchmark_id}] 评分={e.quality_score:.0f} 期望={e.expected_skill} 实际={e.actual_skill}")
|
||||||
|
if e.reasoning:
|
||||||
|
lines.append(f" 理由: {e.reasoning}")
|
||||||
|
else:
|
||||||
|
lines.append(" 无有效评估结果")
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
|
# L5 Adaptive Capability (reuse overfitting consistency data)
|
||||||
|
if report.overfitting_results:
|
||||||
|
lines.append("── L5 自适应能力 ──────────────────────────────────────────")
|
||||||
|
consistency_rates = [r.consistency_rate for r in report.overfitting_results]
|
||||||
|
if consistency_rates:
|
||||||
|
avg_consistency = sum(consistency_rates) / len(consistency_rates)
|
||||||
|
lines.append(f" 测试组数: {len(consistency_rates)}")
|
||||||
|
lines.append(f" 平均自适应率: {avg_consistency:.2%}")
|
||||||
|
high_adapt = sum(1 for r in consistency_rates if r >= 0.8)
|
||||||
|
lines.append(f" 高自适应(>=80%): {high_adapt}/{len(consistency_rates)}")
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
lines.append("=" * 72)
|
lines.append("=" * 72)
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -48,6 +48,20 @@ def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None:
|
||||||
analyzer = MetricsAnalyzer()
|
analyzer = MetricsAnalyzer()
|
||||||
report = analyzer.generate_report(collector)
|
report = analyzer.generate_report(collector)
|
||||||
|
|
||||||
|
# L3 Output Quality Evaluation (optional, requires LLM)
|
||||||
|
try:
|
||||||
|
from tests.e2e.test_capability_router_direct import _get_components
|
||||||
|
|
||||||
|
router, skill_registry, intent_router = _get_components()
|
||||||
|
llm_gateway = getattr(router, "_llm_gateway", None)
|
||||||
|
if llm_gateway is not None:
|
||||||
|
quality_evals = collector.evaluate_output_quality(llm_gateway)
|
||||||
|
report = analyzer.generate_report(collector)
|
||||||
|
# Attach quality evaluations to report
|
||||||
|
report.output_quality_evaluations = quality_evals
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Warning: L3 output quality evaluation skipped: {e}")
|
||||||
|
|
||||||
output_dir = os.path.join(os.path.dirname(__file__), "..", "..", "test-results", "e2e")
|
output_dir = os.path.join(os.path.dirname(__file__), "..", "..", "test-results", "e2e")
|
||||||
paths = MetricsReporter.save_report(report, output_dir)
|
paths = MetricsReporter.save_report(report, output_dir)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -87,8 +87,12 @@ def _build_real_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRout
|
||||||
if not pconf.api_key:
|
if not pconf.api_key:
|
||||||
pconf.api_key = dashscope_key
|
pconf.api_key = dashscope_key
|
||||||
# Set base_url for dashscope if missing
|
# Set base_url for dashscope if missing
|
||||||
|
# Use coding base_url for bailian-coding keys (sk-sp-* prefix)
|
||||||
if not pconf.base_url:
|
if not pconf.base_url:
|
||||||
pconf.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
|
if dashscope_key.startswith("sk-sp-"):
|
||||||
|
pconf.base_url = "https://coding.dashscope.aliyuncs.com/v1"
|
||||||
|
else:
|
||||||
|
pconf.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1"
|
||||||
break
|
break
|
||||||
|
|
||||||
if not server_config.has_llm_provider():
|
if not server_config.has_llm_provider():
|
||||||
|
|
@ -105,6 +109,64 @@ def _build_real_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRout
|
||||||
|
|
||||||
# Build real CostAwareRouter
|
# Build real CostAwareRouter
|
||||||
router_conf = server_config.router or {}
|
router_conf = server_config.router or {}
|
||||||
|
|
||||||
|
# Build SemanticRouter if enabled or if embedding is available
|
||||||
|
semantic_router = None
|
||||||
|
semantic_conf = router_conf.get("semantic", {})
|
||||||
|
if semantic_conf.get("enabled", False):
|
||||||
|
try:
|
||||||
|
from agentkit.chat.semantic_router import SemanticRouter
|
||||||
|
from agentkit.memory.embedder import OpenAIEmbedder
|
||||||
|
|
||||||
|
# Try to get embedder from LLM gateway cache first
|
||||||
|
embedder = getattr(llm_gateway, "_embedder", None)
|
||||||
|
|
||||||
|
# If no cache embedder, create one directly from provider config
|
||||||
|
if embedder is None:
|
||||||
|
# Find a provider with an API key to use for embedding
|
||||||
|
for pname, pconf in server_config.llm_config.providers.items():
|
||||||
|
if pconf.api_key:
|
||||||
|
# Use correct base_url based on key prefix
|
||||||
|
if pconf.api_key.startswith("sk-sp-"):
|
||||||
|
base_url = pconf.base_url or "https://coding.dashscope.aliyuncs.com/v1"
|
||||||
|
else:
|
||||||
|
base_url = pconf.base_url or "https://dashscope.aliyuncs.com/compatible-mode/v1"
|
||||||
|
embedder = OpenAIEmbedder(
|
||||||
|
api_key=pconf.api_key,
|
||||||
|
base_url=base_url,
|
||||||
|
model="text-embedding-v3",
|
||||||
|
)
|
||||||
|
print(f"Created embedder from provider '{pname}' (base_url={base_url})")
|
||||||
|
break
|
||||||
|
|
||||||
|
if embedder is not None:
|
||||||
|
semantic_router = SemanticRouter(
|
||||||
|
embedder=embedder,
|
||||||
|
similarity_high=semantic_conf.get("similarity_high", 0.85),
|
||||||
|
similarity_low=semantic_conf.get("similarity_low", 0.4),
|
||||||
|
)
|
||||||
|
# Build skill embedding index
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
try:
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
except RuntimeError:
|
||||||
|
loop = None
|
||||||
|
|
||||||
|
if loop and loop.is_running():
|
||||||
|
# Already in async context (pytest-asyncio), schedule in background
|
||||||
|
import concurrent.futures
|
||||||
|
|
||||||
|
with concurrent.futures.ThreadPoolExecutor() as pool:
|
||||||
|
pool.submit(asyncio.run, semantic_router.build_index(skill_registry)).result()
|
||||||
|
else:
|
||||||
|
asyncio.run(semantic_router.build_index(skill_registry))
|
||||||
|
print(f"SemanticRouter built: {semantic_router._index.size} skills indexed")
|
||||||
|
else:
|
||||||
|
print("Warning: No embedder available for SemanticRouter")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Warning: SemanticRouter not available: {e}")
|
||||||
|
|
||||||
router = CostAwareRouter(
|
router = CostAwareRouter(
|
||||||
llm_gateway=llm_gateway,
|
llm_gateway=llm_gateway,
|
||||||
model="default",
|
model="default",
|
||||||
|
|
@ -112,6 +174,7 @@ def _build_real_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRout
|
||||||
auction_enabled=router_conf.get("auction_enabled", False),
|
auction_enabled=router_conf.get("auction_enabled", False),
|
||||||
classifier=router_conf.get("classifier", "heuristic"),
|
classifier=router_conf.get("classifier", "heuristic"),
|
||||||
merged_llm_classify=router_conf.get("merged_llm_classify", True),
|
merged_llm_classify=router_conf.get("merged_llm_classify", True),
|
||||||
|
semantic_router=semantic_router,
|
||||||
)
|
)
|
||||||
|
|
||||||
return router, skill_registry, intent_router
|
return router, skill_registry, intent_router
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue