From e984b4c4626b398f3990ca3ec6e65697864db952 Mon Sep 17 00:00:00 2001 From: chiguyong Date: Mon, 15 Jun 2026 22:43:13 +0800 Subject: [PATCH] =?UTF-8?q?feat(router):=20optimize=20routing=20intelligen?= =?UTF-8?q?ce=20=E2=80=94=20ExecutionMode=20expansion,=20multi-candidate?= =?UTF-8?q?=20scoring,=20quality=20gate=20skill=20match?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Expand ExecutionMode enum with REWOO/REFLEXION/PLAN_EXEC - Add _resolve_execution_mode() to respect skill.config.execution_mode - Rewrite IntentRouter._match_keywords() for multi-candidate scoring - Add QualityGate 5th dimension: skill_match validation with warning escalation - Calibrate HeuristicClassifier: low-complexity signals only when no high signals - Fix negation regex for Chinese text (avoid matching past punctuation) - Fix backtest mode_map normalization and .env loading - Add 61 unit tests (21 HeuristicClassifier + 14 IntentRouter + 13 QualityGate + 13 existing) Results: execution_mode_accuracy 9.09%→36.36%, skill_routing_F1 66.67%→77.78% --- ...02-feat-e2e-capability-improvement-plan.md | 280 ++++ ...t-router-intelligence-optimization-plan.md | 326 ++++ scripts/run_e2e.sh | 328 ++++ src/agentkit/chat/skill_routing.py | 66 +- src/agentkit/quality/gate.py | 103 +- src/agentkit/router/intent.py | 52 +- tests/e2e/__init__.py | 11 + tests/e2e/benchmark_dataset.py | 830 ++++++++++ tests/e2e/benchmark_generator.py | 339 ++++ tests/e2e/capability_metrics.py | 1366 +++++++++++++++++ tests/e2e/conftest.py | 413 +++++ tests/e2e/test_basic_api.py | 277 ++++ tests/e2e/test_basic_cli.py | 353 +++++ tests/e2e/test_basic_websocket.py | 170 ++ tests/e2e/test_capability_alignment.py | 305 ++++ tests/e2e/test_capability_react.py | 324 ++++ tests/e2e/test_capability_router_direct.py | 342 +++++ tests/e2e/test_capability_routing.py | 273 ++++ tests/e2e/test_capability_team.py | 252 +++ tests/unit/chat/test_skill_routing.py | 332 ++++ 
tests/unit/quality/__init__.py | 0 tests/unit/quality/test_gate.py | 172 +++ tests/unit/router/test_intent.py | 200 +++ 23 files changed, 7048 insertions(+), 66 deletions(-) create mode 100644 docs/plans/2026-06-15-002-feat-e2e-capability-improvement-plan.md create mode 100644 docs/plans/2026-06-15-003-feat-router-intelligence-optimization-plan.md create mode 100755 scripts/run_e2e.sh create mode 100644 tests/e2e/__init__.py create mode 100644 tests/e2e/benchmark_dataset.py create mode 100644 tests/e2e/benchmark_generator.py create mode 100644 tests/e2e/capability_metrics.py create mode 100644 tests/e2e/conftest.py create mode 100644 tests/e2e/test_basic_api.py create mode 100644 tests/e2e/test_basic_cli.py create mode 100644 tests/e2e/test_basic_websocket.py create mode 100644 tests/e2e/test_capability_alignment.py create mode 100644 tests/e2e/test_capability_react.py create mode 100644 tests/e2e/test_capability_router_direct.py create mode 100644 tests/e2e/test_capability_routing.py create mode 100644 tests/e2e/test_capability_team.py create mode 100644 tests/unit/chat/test_skill_routing.py create mode 100644 tests/unit/quality/__init__.py create mode 100644 tests/unit/quality/test_gate.py create mode 100644 tests/unit/router/test_intent.py diff --git a/docs/plans/2026-06-15-002-feat-e2e-capability-improvement-plan.md b/docs/plans/2026-06-15-002-feat-e2e-capability-improvement-plan.md new file mode 100644 index 0000000..e8f9c75 --- /dev/null +++ b/docs/plans/2026-06-15-002-feat-e2e-capability-improvement-plan.md @@ -0,0 +1,280 @@ +--- +title: "feat: E2E能力分析框架改进与路由智能化提升" +type: feat +status: active +created: 2026-06-15 +plan-depth: standard +--- + +# E2E能力分析框架改进与路由智能化提升 + +## Summary + +改进E2E能力分析框架,解决当前基准数据集与实际技能不对应、覆盖面窄(仅19条)、指标判断过于简化等核心问题。同时将ExpertTeamRouter集成到CostAwareRouter自动触发链路,增加路由器直接回测层,并将基准用例扩展至60条,使召回率/F1/过拟合检测等指标具备统计意义。 + +## Problem Frame + +当前E2E能力分析框架存在四个关键问题: + +1. 
**基准数据与实际技能脱节**:`benchmark_dataset.py` 中的 `expected_skill`(如 `email_composer`、`i18n_translator`)与 `configs/skills/` 中的15个实际技能不对应,导致路由回测结果无意义 +2. **覆盖面过窄**:仅19条基准用例,PRF统计不稳定;缺少 SemanticRouter、ExpertTeamRouter、AlignmentGuard 的专项基准 +3. **指标判断粗糙**:`complexity_correct` 直接等于 `execution_mode_correct`,无法独立评估复杂度估算;改进策略中的 `target_module` 引用了旧文件名 +4. **团队路由未自动集成**:`ExpertTeamRouter` 与 `CostAwareRouter` 独立运行,`TEAM_COLLAB` 模式无法自动触发 + +## Requirements + +- R1: 基准数据集中的 `expected_skill` 必须与 `configs/skills/` 中的实际技能一一对应 +- R2: 基准用例数量扩展至60条,覆盖路由/执行/团队/一致性/对齐守卫五个维度 +- R3: 增加路由器直接回测层(不经过HTTP API),能区分路由错误与API层错误 +- R4: `complexity_correct` 独立于 `execution_mode_correct`,基于 HeuristicClassifier 分数与期望复杂度的映射判断 +- R5: ExpertTeamRouter 集成到 CostAwareRouter.route() 中,高复杂度任务自动触发 TEAM_COLLAB +- R6: 增加 SemanticRouter 专项基准(相似度分数分布、三档精确率) +- R7: 增加 AlignmentGuard 约束检查基准 +- R8: 修正改进策略中的 target_module 文件路径 +- R9: 报告输出保持中文 + +## Key Technical Decisions + +### KTD1: 双层回测架构 + +**决策**:在现有HTTP API层E2E测试之上,增加路由器直接回测层。 + +**理由**:纯API测试无法区分"路由器选错了技能"和"API层传递参数出错"两种失败模式。直接回测层调用 `CostAwareRouter.route()` 方法,记录 `SkillRoutingResult` 的完整字段(`match_method`、`match_confidence`、`execution_trace`),使根因分析能精确定位到具体路由层。 + +**替代方案**:保持纯API层测试 → 被否决,因为无法满足R3的精确诊断需求。 + +### KTD2: ExpertTeamRouter 集成方式 + +**决策**:在 `CostAwareRouter._route_layer2()` 末尾增加 ExpertTeamRouter 检查点。当 Layer 2 判定 `execution_mode=REACT` 且 `complexity >= 0.7` 时,调用 `ExpertTeamRouter.resolve()` 判断是否升级为 `TEAM_COLLAB`。 + +**理由**:保持三层路由的递进式架构不变,仅在 Layer 2 出口处增加团队模式升级逻辑,最小化对现有路由流程的侵入。 + +### KTD3: 复杂度正确性判断策略 + +**决策**:基于 HeuristicClassifier 返回的浮点复杂度分数与期望复杂度等级的映射区间判断:`low=[0, 0.3)`、`medium=[0.3, 0.7)`、`high=[0.7, 1.0]`。 + +**理由**:直接使用浮点分数比仅比较执行模式更精确,能区分"复杂度分数0.29被判为low但期望medium"和"复杂度分数0.65被判为medium且期望medium"两种情况。 + +### KTD4: 基准用例与实际技能对齐 + +**决策**:从 `configs/skills/` 的15个实际技能中提取 `intent.keywords` 和 `intent.description`,自动生成基准用例的 `expected_skill`,而非手动硬编码。 + +**理由**:手动维护的技能名容易与实际配置脱节(当前问题)。自动对齐确保基准数据始终反映最新的技能配置。 + +--- + +## Implementation Units + +### U1. 
基准数据集与实际技能对齐 + +**Goal**: 修复 benchmark_dataset.py 中 expected_skill 与实际技能的对应关系,扩展至60条用例 + +**Dependencies**: 无 + +**Files**: +- `tests/e2e/benchmark_dataset.py` — 重写基准数据集 +- `tests/e2e/benchmark_generator.py` — 新增:从技能配置自动生成基准用例 + +**Approach**: +1. 新增 `BenchmarkGenerator` 类,读取 `configs/skills/*.yaml`,提取每个技能的 `intent.keywords`、`intent.description`、`intent.examples`,自动生成 `BenchmarkCase` +2. 为每个技能生成3-5条基准用例:1条原始输入 + 2-4条改写 +3. 保留手动定义的边界用例(问候语、身份识别、无匹配回退) +4. 新增维度:`alignment`(对齐守卫)、`semantic_router`(语义路由专项) +5. 总目标:路由20+、执行15+、团队10+、一致性10+、对齐守卫5+ + +**Patterns to follow**: `BenchmarkCase` Pydantic frozen model 模式 + +**Test scenarios**: +- 生成的基准用例 expected_skill 全部存在于 configs/skills/ 中 +- 每个技能至少有1条基准用例 +- paraphrases 非空的用例占比 > 60% +- 总用例数 >= 60 + +**Verification**: 运行 `python -c "from tests.e2e.benchmark_dataset import ALL_BENCHMARKS; print(len(ALL_BENCHMARKS))"` 确认 >= 60 + +### U2. 路由器直接回测层 + +**Goal**: 增加不经过HTTP API的路由器直接回测,记录完整路由结果 + +**Dependencies**: U1 + +**Files**: +- `tests/e2e/test_capability_router_direct.py` — 新增:路由器直接回测 +- `tests/e2e/conftest.py` — 增加 router fixture + +**Approach**: +1. 在 conftest.py 中增加 `cost_aware_router` fixture,直接实例化 `CostAwareRouter`(使用 MockLLMProvider) +2. 新增 `test_capability_router_direct.py`,对每个基准用例调用 `router.route(query)` 并记录完整 `SkillRoutingResult` +3. 记录字段:`skill_name`、`execution_mode`、`complexity`、`match_method`(layer0/layer1/layer1.5/layer2)、`match_confidence`、`execution_trace` +4. 将路由器回测结果也写入 MetricsCollector,增加 `match_method` 和 `match_confidence` 字段 + +**Patterns to follow**: 现有 `test_capability_routing.py` 的参数化测试模式 + +**Test scenarios**: +- Layer 0 规则匹配:问候语 → DIRECT_CHAT,@skill:xxx → 对应技能 +- Layer 1 复杂度分类:简单问答 → low,多步分析 → high +- Layer 1.5 语义路由:同义改写 → 相同技能,相似度 > 0.6 +- Layer 2 能力匹配:高复杂度 → REACT/TEAM_COLLAB +- 路由器回测与API回测结果一致性 > 90% + +**Verification**: 运行 `pytest tests/e2e/test_capability_router_direct.py -v` 全部通过 + +### U3. 
指标体系增强 + +**Goal**: 修复 complexity_correct 判断逻辑,增加语义路由/团队路由指标,修正 target_module 路径 + +**Dependencies**: U1 + +**Files**: +- `tests/e2e/capability_metrics.py` — 增强指标模型和分析器 +- `tests/e2e/benchmark_dataset.py` — 增加 semantic_router / alignment 类别 + +**Approach**: +1. `CapabilityObservation` 增加 `actual_complexity_score: float | None`、`actual_match_method: str | None`、`actual_match_confidence: float | None` 字段 +2. `complexity_correct` 改为基于分数区间映射判断(KTD3) +3. `MetricsAnalyzer` 增加 `analyze_semantic_router()` 方法:按 high/medium/low 三档统计精确率 +4. `MetricsAnalyzer` 增加 `analyze_team_routing()` 方法:统计 `explicit_team` vs `complexity_suggestion` 的成功率 +5. 修正 `plan_improvements()` 中所有 `target_module`:`cost_aware_router.py` → `chat/skill_routing.py` +6. 报告增加"语义路由分析"和"团队路由分析"章节 + +**Patterns to follow**: 现有 `MetricsAnalyzer` 的分析方法模式 + +**Test scenarios**: +- complexity_correct 独立于 execution_mode_correct +- 语义路由三档精确率计算正确 +- 团队路由成功率计算正确 +- target_module 路径与实际代码对应 +- 中文报告输出包含新增章节 + +**Verification**: 运行 `pytest tests/e2e/test_capability_routing.py tests/e2e/test_capability_react.py -v` 通过 + +### U4. ExpertTeamRouter 集成到 CostAwareRouter + +**Goal**: 高复杂度任务自动触发 TEAM_COLLAB 模式 + +**Dependencies**: U2 + +**Files**: +- `src/agentkit/chat/skill_routing.py` — 修改 `_route_layer2()` 增加团队升级逻辑 +- `src/agentkit/experts/router.py` — 增加 `can_handle()` 方法供路由器查询 +- `tests/unit/chat/test_skill_routing.py` — 增加团队路由单元测试 + +**Approach**: +1. 在 `CostAwareRouter._route_layer2()` 末尾,当 `execution_mode == REACT` 且 `complexity >= COMPLEXITY_THRESHOLD` 时,调用 `ExpertTeamRouter.resolve(content, complexity)` +2. 如果 `ExpertTeamRouter` 返回有效结果,升级 `execution_mode` 为 `TEAM_COLLAB`,并在 `execution_trace` 中记录 `"team_upgrade": True` +3. 在 `ExpertTeamRouter` 中增加 `can_handle(content: str) -> bool` 方法,检查是否有匹配的专家模板 +4. 
保持向后兼容:如果 `ExpertTeamRouter` 不可用(未配置专家模板),静默跳过 + +**Patterns to follow**: 现有 `_route_layer2()` 的 Vickrey 拍卖路径模式 + +**Test scenarios**: +- 高复杂度 + 有专家模板 → TEAM_COLLAB +- 高复杂度 + 无专家模板 → 保持 REACT +- 低复杂度 → 不触发团队路由 +- @team 前缀 → 直接 TEAM_COLLAB(Layer 0 处理) +- execution_trace 包含 team_upgrade 标记 + +**Verification**: 运行 `pytest tests/unit/chat/test_skill_routing.py -v -k team` 通过 + +### U5. AlignmentGuard 与 CascadeDetector 指标集成 + +**Goal**: 将对齐守卫约束违规和级联告警纳入E2E指标收集 + +**Dependencies**: U3 + +**Files**: +- `tests/e2e/test_capability_alignment.py` — 新增:对齐守卫基准测试 +- `tests/e2e/capability_metrics.py` — 增加 alignment 维度指标 + +**Approach**: +1. 新增 `test_capability_alignment.py`,包含5+条对齐守卫基准用例: + - 否定约束测试("不要提及价格"→ 输出不含价格) + - 肯定约束测试("必须包含摘要"→ 输出含摘要) + - 级联告警测试(连续5次相似查询 → 触发 CascadeAlert) +2. `CapabilityObservation` 增加 `alignment_violations: int`、`cascade_alert: bool` 字段 +3. `MetricsAnalyzer` 增加 `analyze_alignment()` 方法 +4. 报告增加"对齐守卫分析"章节 + +**Patterns to follow**: 现有 `test_capability_team.py` 的测试模式 + +**Test scenarios**: +- 否定约束:输出不包含禁止内容 +- 肯定约束:输出包含必要内容 +- 级联告警:连续交互触发告警 +- 无约束:正常通过 + +**Verification**: 运行 `pytest tests/e2e/test_capability_alignment.py -v` 通过 + +### U6. 运行脚本与CI集成 + +**Goal**: 更新运行脚本,支持分层回测和CI集成 + +**Dependencies**: U2, U3, U4, U5 + +**Files**: +- `scripts/run_e2e.sh` — 增加直接回测和分层运行选项 +- `tests/e2e/conftest.py` — 确保 pytest_sessionfinish 报告生成正确 + +**Approach**: +1. `run_e2e.sh` 增加 `--direct` 选项(仅运行路由器直接回测) +2. `run_e2e.sh` 增加 `--alignment` 选项(仅运行对齐守卫测试) +3. `run_e2e.sh` 增加 `--full` 选项(运行全部:API + 直接 + 对齐) +4. 确保报告输出目录 `test-results/e2e/` 在 CI 中作为 artifact 上传 +5. 
增加 `--baseline` 选项:与上次报告对比,输出指标变化趋势 + +**Patterns to follow**: 现有 `run_e2e.sh` 的选项模式 + +**Test scenarios**: +- `--direct` 仅运行路由器直接回测 +- `--alignment` 仅运行对齐守卫测试 +- `--full` 运行所有能力测试 +- `--analyze` 生成完整中文报告 +- 报告文件正确保存到 test-results/e2e/ + +**Verification**: 运行 `./scripts/run_e2e.sh --direct` 和 `./scripts/run_e2e.sh --analyze` 验证 + +--- + +## Scope Boundaries + +### In Scope +- 基准数据集与实际技能对齐并扩展至60条 +- 路由器直接回测层 +- 指标体系增强(复杂度、语义路由、团队路由) +- ExpertTeamRouter 集成到 CostAwareRouter +- AlignmentGuard 指标集成 +- 运行脚本更新 + +### Out of Scope +- CostAwareRouter 三层架构重写 +- 新增 LLM Provider +- 前端界面修改 +- 生产环境部署 +- intent.examples 嵌入到 SemanticRouter(可作为后续优化) +- disambiguation_keywords 配置字段(改进策略已规划,但属于技能配置层面的独立改进) + +### Deferred to Follow-Up Work +- 基于用户真实查询日志的基准用例持续扩充 +- 复杂度评估模型训练(替代启发式规则) +- 意图泛化CI防线的 GitHub Actions 配置 +- OutputStandardizer.quality_score 与路由决策的关联分析 + +--- + +## Risks & Mitigations + +| 风险 | 影响 | 缓解措施 | +|------|------|----------| +| ExpertTeamRouter 集成可能影响现有路由性能 | Layer 2 增加一次 resolve() 调用 | 仅在 complexity >= 0.7 时触发,且 can_handle() 快速返回 | +| 基准用例自动生成可能产生低质量用例 | PRF 指标失真 | 人工审核自动生成的用例,保留手动边界用例 | +| 路由器直接回测需要 MockLLMProvider 完整支持 | 某些路由路径无法测试 | 优先覆盖 Layer 0/1,Layer 1.5/2 标记为需要真实 LLM | +| 60条用例可能增加E2E运行时间 | CI 流水线变慢 | 按维度分组运行,支持 `--fast` 快速失败模式 | + +--- + +## System-Wide Impact + +- **路由层**:`skill_routing.py` 增加 ExpertTeamRouter 调用点,影响所有高复杂度请求的路由决策 +- **测试层**:新增3个测试文件,conftest.py 增加2个 fixture,运行脚本增加4个选项 +- **报告层**:能力分析报告增加3个章节(语义路由、团队路由、对齐守卫) +- **配置层**:无配置文件变更(disambiguation_keywords 推迟到后续) diff --git a/docs/plans/2026-06-15-003-feat-router-intelligence-optimization-plan.md b/docs/plans/2026-06-15-003-feat-router-intelligence-optimization-plan.md new file mode 100644 index 0000000..9b75b62 --- /dev/null +++ b/docs/plans/2026-06-15-003-feat-router-intelligence-optimization-plan.md @@ -0,0 +1,326 @@ +--- +title: "feat: 路由智能化优化 — 复杂度校准、意图消歧、质量门控增强" +status: active +created: 2026-06-15 +updated: 2026-06-15 +origin: test-results/e2e/capability_report.txt (真实LLM回测分析报告) +--- + +## 
Summary + +基于真实 LLM 回测分析报告暴露的三个核心根因,优化 CostAwareRouter 的路由智能化水平:修复 HeuristicClassifier 复杂度评分偏差(执行模式准确率从 9.09% 提升至 >30%),解决 IntentRouter 首次匹配导致的技能混淆(技能路由 F1 从 66.67% 提升至 >80%),增强 QualityGate 的技能匹配验证拦截错误路由。 + +**当前进度**: U1 代码已实现,待补单元测试;U2/U3 待实现;U4 待验证。 + +--- + +## Problem Frame + +真实 LLM 回测(74个观测)揭示三个核心问题: + +1. **执行模式准确率 9.09%** — HeuristicClassifier 倾向高估复杂度,将简单问答(如"你好"、"你是谁")判为需要 REACT 而非 DIRECT_CHAT。40个执行模式判断错误中仅1次低估复杂度。 +2. **keyword_match 召回率 0%** — 62个关键词匹配用例全部未路由到预期技能,真实 SkillRegistry 虽然加载了15个技能,但路由链路未能正确匹配。 +3. **意图歧义** — plan_exec_agent 与 goal_driven_agent 的关键词重叠("规划"、"报告"子串),IntentRouter 首次匹配策略导致混淆。 + +--- + +## Requirements + +- R1: HeuristicClassifier 复杂度评分校准 — 简单问答应得低分(<0.3),复杂任务应得高分(>0.7) +- R2: IntentRouter 多候选评分排序 — 匹配多个技能时按得分排序选择最佳,而非首次匹配 +- R3: QualityGate 技能匹配验证 — 拦截路由结果与技能能力不一致的输出 +- R4: 回测验证 — 改进后执行模式准确率 >30%,技能路由 F1 >80% + +--- + +## Key Technical Decisions + +### KTD1: HeuristicClassifier 评分重构 — 增加低复杂度信号 + +**决策**: 在现有高/中复杂度关键词之外,增加低复杂度关键词列表和否定信号机制。当输入包含低复杂度信号(问候、闲聊、简单定义)时,直接降低基础分数;当高复杂度词出现在否定上下文("不要X"、"无需X")时,不增加分数。 + +**理由**: 当前分类器只有正向累加逻辑(命中高复杂度词→加分),没有负向扣减逻辑。这导致任何包含"分析"、"搜索"等常见动词的输入都被判为高复杂度,即使实际是简单问答。 + +**替代方案**: 用 LLM 替代规则分类器 — 延迟高(~500ms)、成本高(~100 tokens),且当前 merged_llm_classify 已在 0.3-0.7 区间使用 LLM,规则层应保持零成本。 + +**实现状态**: 代码已完成。`classify()` 方法已重写,包含低复杂度信号优先检测、否定上下文排除、阈值调整(0.15→0.10, 0.45→0.35)、短疑问句扣减。 + +### KTD2: IntentRouter 多候选评分排序 + +**决策**: 修改 `_match_keywords()` 从"首次匹配返回"改为"收集所有匹配候选,按匹配关键词数量×关键词长度排序,返回最佳匹配"。 + +**理由**: 首次匹配依赖 skills 列表遍历顺序,不可控且不公平。多候选评分让匹配更多、更精确关键词的技能胜出。例如输入"规划一个调研报告"同时匹配 plan_exec_agent("规划"、"报告")和 goal_driven_agent("规划"、"调研"),但 goal_driven_agent 还匹配"生成报告"的子串"报告",匹配数相同则按关键词长度排序,更长的关键词("调研报告" > "报告")权重更高。 + +**替代方案**: 在技能配置中添加互斥关键词 — 需要逐对配置,维护成本高,且无法覆盖所有重叠场景。 + +**实现状态**: 待实现。当前 `_match_keywords()` 仍为首次匹配逻辑(`intent.py` L89-98)。 + +### KTD3: QualityGate 技能匹配验证 — 轻量级路由一致性检查 + +**决策**: 在 QualityGate.validate() 中增加可选的 `skill_context` 参数,当提供时检查输出内容是否与路由到的技能的能力范围一致。使用规则检查(关键词覆盖度)而非 LLM 语义检查,保持零额外成本。 + +**理由**: 当前 
QualityGate 只检查输出格式(必填字段、字数、Schema),不检查输出内容是否与路由技能匹配。3个用例虽然 HTTP 成功但路由到了错误技能,质量门控未能拦截。 + +**实现状态**: 待实现。当前 `validate()` 仅有四维度检查(`gate.py` L37-114)。 + +--- + +## Scope Boundaries + +### In Scope +- HeuristicClassifier 评分逻辑优化(代码已完成,待补测试) +- IntentRouter._match_keywords() 多候选评分排序 +- QualityGate 增加技能匹配验证维度 +- 更新回测基准数据集以反映新的评分逻辑 +- 改进后重跑回测验证 + +### Out of Scope +- LLM 分类器优化(merged_llm_classify 和 _classify_with_llm 已有实现,不在本次优化范围) +- SemanticRouter 优化(需要嵌入模型,属于独立优化方向) +- ExpertTeamRouter 在服务器启动时的注入(已实现但未接入 create_app,属于部署配置问题) +- 新增技能配置文件 + +### Deferred to Follow-Up Work +- 训练专用意图分类模型替代规则匹配(长期方向) +- 构建复杂度校准数据集持续优化阈值 +- 实现自动质量回归检测 CI 流水线 + +--- + +## Implementation Units + +### U1. HeuristicClassifier 复杂度评分校准 + +**Goal**: 修复复杂度评分偏差,使简单问答得低分、复杂任务得高分,提升执行模式准确率 + +**Requirements**: R1, R4 + +**Dependencies**: None + +**Files:** +- `src/agentkit/chat/skill_routing.py` — HeuristicClassifier 类(**代码已完成**) +- `tests/unit/chat/test_skill_routing.py` — 新增复杂度校准测试(**待编写**) + +**Approach:** + +代码已实现以下改动: + +1. 增加低复杂度关键词列表 `_LOW_COMPLEXITY_HINTS_CN`(17个词)和 `_LOW_COMPLEXITY_HINTS_EN`(14个词),命中时基础分数为 0.05,且不再累加高复杂度词分数。 + +2. 增加否定上下文检测 `_NEGATION_PATTERNS`,匹配"不要/无需/不用/don't/no need/without"后跟的词,该词不计入高复杂度匹配。 + +3. 调整基础分数阈值:无关键词命中时基础分 0.10(原 0.15),中等复杂度命中基础分 0.35(原 0.45)。 + +4. 
增加短疑问句检测 `_SHORT_QUESTION_RE`:以"?"或"?"结尾且长度 <30 字符时,额外 -0.10。 + +**剩余工作**: 编写单元测试验证分类器行为。 + +**Patterns to follow:** 现有 `test_skill_routing.py` 中的测试类结构(`TestExpertTeamRouterCanHandle` 等) + +**Test scenarios:** + +- **低复杂度信号优先检测** + - "你好" → 复杂度 < 0.3(命中 `_LOW_COMPLEXITY_HINTS_CN`) + - "Hello" → 复杂度 < 0.3(命中 `_LOW_COMPLEXITY_HINTS_EN`) + - "嗨,早上好" → 复杂度 < 0.3(多个低复杂度词命中) + - "你好,请帮我分析一下这个数据" → 复杂度不低于 0.3(低复杂度信号仅在无高/中复杂度信号时生效;此处"分析"命中复杂度关键词,按高/中复杂度规则评分) + +- **身份查询** + - "你是谁" → 复杂度 < 0.3 + - "你叫什么" → 复杂度 < 0.3 + +- **否定上下文排除** + - "不要搜索" → "搜索"不计入高复杂度匹配,复杂度 < 0.3 + - "无需分析,直接告诉我答案" → "分析"被否定,复杂度 < 0.3 + - "分析市场趋势,但不要搜索" → "搜索"被否定但"分析"未被否定,复杂度 > 0.5 + +- **阈值调整验证** + - 无关键词的短消息("好的")→ 复杂度 ≤ 0.10 + - 含中等复杂度词("如何使用Python?")→ 基础分 0.35 而非 0.45 + +- **短疑问句扣减** + - "怎么用?" → 复杂度 < 0.3(短疑问句 -0.10) + - "如何设计一个高可用的微服务架构?" → 复杂度 > 0.5(长疑问句不扣减) + +- **复杂任务高分** + - "分析市场趋势并生成报告" → 复杂度 > 0.7(2个高复杂度词命中) + - "执行部署脚本并重启服务" → 复杂度 > 0.7 + +- **边界条件** + - 空字符串 → 复杂度 0.0 + - 纯空格 → 复杂度 0.0 + - 超长低复杂度消息(>200字符的问候)→ 复杂度 ≤ 0.10 + +**Verification:** `pytest tests/unit/chat/test_skill_routing.py -v`,所有 HeuristicClassifier 测试通过 + +--- + +### U2. IntentRouter 多候选评分排序 + +**Goal**: 解决首次匹配导致的技能混淆,使匹配更精确的技能胜出 + +**Requirements**: R2, R4 + +**Dependencies**: None + +**Files:** +- `src/agentkit/router/intent.py` — IntentRouter._match_keywords() +- `tests/unit/router/test_intent.py` — 新建多候选排序测试 + +**Approach:** + +1. 重写 `_match_keywords()` 方法(当前为 `intent.py` L75-99): + + 当前逻辑(首次匹配): + ``` + for skill in skills: + for keyword in keywords: + if keyword in combined_text: + return RoutingResult(matched_skill=skill.name, ...)
+ return None + ``` + + 改为多候选评分: + ``` + candidates = [] + for skill in skills: + matched_kws = [kw for kw in skill.config.intent.keywords if kw.lower() in combined_text] + if matched_kws: + score = sum(len(kw) for kw in matched_kws) # 更长关键词权重更高 + candidates.append((skill, matched_kws, score)) + if not candidates: + return None + candidates.sort(key=lambda c: (-c[2], c[0].name)) # 得分降序,同分时按技能名称字母序 + best_skill, best_kws, best_score = candidates[0] + confidence = min(1.0, 0.5 + 0.1 * len(best_kws)) + return RoutingResult(matched_skill=best_skill.name, method="keyword", confidence=confidence) + ``` + +2. 保持 `RoutingResult` 数据类接口不变,`method` 仍为 `"keyword"`。 + +3. 向后兼容:单候选时行为与原来一致(只有一个 skill 匹配时,排序无影响)。 + +4. 需要创建 `tests/unit/router/` 目录和 `__init__.py`。 + +**Patterns to follow:** 现有 `RoutingResult` 数据类结构;`_extract_string_values()` 的输入处理方式 + +**Test scenarios:** + +- **单候选匹配** — 输入只匹配一个 skill 的关键词,选中的技能与原来一致;confidence 按公式 `min(1.0, 0.5 + 0.1 × 匹配关键词数)` 计算(匹配1个关键词时为 0.6) +- **多候选匹配 — 得分不同** — 输入同时匹配 skill_a(关键词"规划"2字)和 skill_b(关键词"调研报告"4字),skill_b 得分更高应胜出 +- **多候选匹配 — 得分相同** — 两个 skill 得分相同时,按名称字母序稳定排序 +- **无匹配** — 无任何关键词命中,返回 None +- **空关键词列表** — skill 的 intent.keywords 为空列表,不参与匹配 +- **大小写不敏感** — 英文关键词 "Search" 应匹配 "search" +- **子串匹配行为** — 中文关键词"报告"应匹配包含"报告"的输入(保持现有子串匹配语义) +- **confidence 计算** — 匹配1个关键词 confidence=0.6,匹配3个 confidence=0.8,上限 1.0 + +**Verification:** `pytest tests/unit/router/test_intent.py -v`,多候选排序测试通过 + +--- + +### U3. QualityGate 技能匹配验证 + +**Goal**: 增加路由一致性检查,拦截技能匹配错误的低质量输出 + +**Requirements**: R3, R4 + +**Dependencies:** None + +**Files:** +- `src/agentkit/quality/gate.py` — QualityGate.validate() +- `tests/unit/quality/test_gate.py` — 新建技能匹配验证测试 + +**Approach:** + +1. 在 `QualityGate.validate()` 签名中增加可选参数 `skill_context: dict | None = None`: + ```python + async def validate( + self, + output: dict[str, Any], + skill: Skill, + skill_context: dict | None = None, # 新增 + ) -> QualityResult: + ``` + +2. `skill_context` 结构:`{"skill_name": str, "intent_keywords": list[str]}` + +3.
当 `skill_context` 提供且 `intent_keywords` 非空时,增加第五维度检查"技能匹配验证": + - 将 output 中所有字符串值拼接 + - 检查拼接文本是否包含至少一个 `intent_keywords` 中的关键词(子串匹配) + - 如果 0 个关键词匹配 → `QualityCheck(name="skill_match", passed=True, message="Warning: output may not match routed skill")` — 警告但不拦截 + - 如果 ≥ 1 个关键词匹配 → `QualityCheck(name="skill_match", passed=True)` — 静默通过 + +4. 警告升级为失败的组合逻辑:当 `skill_match` 警告存在且其他任何维度检查失败时,`skill_match` 的 `passed` 也变为 `False`,导致整体 `passed=False`。 + +5. 保持向后兼容:`skill_context` 为 None 或缺少 `intent_keywords` 时,行为与原来完全一致(四维度检查)。 + +**Patterns to follow:** 现有四维度检查模式(`gate.py` L50-114);`QualityCheck` 数据类 + +**Test scenarios:** + +- **无 skill_context** — 行为与原来一致,仅四维度检查 +- **skill_context=None** — 等同于无 skill_context +- **skill_context 缺少 intent_keywords** — 等同于无 skill_context +- **有 skill_context 且输出包含关键词** — 通过,无警告消息 +- **有 skill_context 且输出不包含任何关键词** — 通过但有警告消息 +- **输出无关 + 其他维度失败** — skill_match passed=False,整体 passed=False +- **输出无关 + 其他维度全部通过** — skill_match passed=True(仅警告),整体 passed=True +- **空 intent_keywords 列表** — 跳过技能匹配检查 + +**Verification:** `pytest tests/unit/quality/test_gate.py -v`,技能匹配验证测试通过 + +--- + +### U4. 回测验证与基准更新 + +**Goal**: 验证改进效果,更新基准数据集 + +**Requirements**: R4 + +**Dependencies:** U1, U2, U3 + +**Files:** +- `tests/e2e/test_capability_router_direct.py` — 使用真实 LLM 回测 +- `tests/e2e/benchmark_dataset.py` — 可能需要更新预期值 +- `test-results/e2e/capability_report.txt` — 对比改进前后报告 + +**Approach:** + +1. 运行完整回测:`python3 -m pytest tests/e2e/test_capability_router_direct.py -v` + +2. 对比改进前后指标: + - 执行模式准确率:9.09% → 目标 >30% + - 技能路由 F1:66.67% → 目标 >80% + - 任务成功率:100% → 保持 + +3. 如果基准数据集中的预期值因评分逻辑变化需要调整,更新 `benchmark_dataset.py` + +4. 
保存改进后报告为基线:`cp test-results/e2e/capability_report.json test-results/e2e/baseline_capability_report.json` + +**Test scenarios:** +- 回测全部通过 +- 执行模式准确率 >30% +- 技能路由 F1 >80% +- 无回归(任务成功率不下降) + +**Verification:** 运行回测并检查报告指标 + +--- + +## Risks & Dependencies + +| 风险 | 影响 | 缓解 | +|------|------|------| +| 复杂度评分调整可能过度修正,导致复杂任务被判为简单 | 高复杂度任务路由到 DIRECT_CHAT,无法使用工具 | 保留 merged_llm_classify 兜底机制,0.3-0.7 区间仍由 LLM 二次确认 | +| 多候选排序可能改变现有路由行为的兼容性 | 已有用户依赖的路由结果可能变化 | 排序逻辑仅在多候选时生效,单候选行为不变 | +| QualityGate 技能匹配验证的"相关词"判断可能误报 | 正常输出被标记为警告 | 使用 warning 级别而非 error,不单独拦截 | +| keyword_match 召回率 0% 的根因可能不仅是 IntentRouter | 即使修复多候选排序,仍可能因技能配置关键词不匹配而召回率低 | U4 回测后若仍低,需进一步分析技能配置与基准用例的对齐度 | + +--- + +## Open Questions + +- 复杂度评分的具体阈值已在代码中设定初始值(0.05/0.10/0.35/0.65/0.80),需通过 U4 回测校准 +- 否定上下文检测的正则模式覆盖度需在回测中验证,可能需要迭代补充 +- keyword_match 召回率 0% 是否完全由 IntentRouter 首次匹配导致,还是技能配置关键词本身与基准用例不对齐 — 需 U2 实现后通过 U4 验证 diff --git a/scripts/run_e2e.sh b/scripts/run_e2e.sh new file mode 100755 index 0000000..e910d5a --- /dev/null +++ b/scripts/run_e2e.sh @@ -0,0 +1,328 @@ +#!/usr/bin/env bash +# ============================================================================= +# Fischer AgentKit — E2E Backtest Runner +# ============================================================================= +# +# Usage: +# ./scripts/run_e2e.sh # Run all E2E tests +# ./scripts/run_e2e.sh --basic # Run basic function tests only +# ./scripts/run_e2e.sh --capability # Run agent capability tests only +# ./scripts/run_e2e.sh --cli # Run CLI tests only +# ./scripts/run_e2e.sh --api # Run API tests only +# ./scripts/run_e2e.sh --ws # Run WebSocket tests only +# ./scripts/run_e2e.sh --routing # Run routing intelligence tests +# ./scripts/run_e2e.sh --react # Run ReAct intelligence tests +# ./scripts/run_e2e.sh --team # Run team collaboration tests +# ./scripts/run_e2e.sh --report # Generate HTML report +# ./scripts/run_e2e.sh --analyze # Run capability tests + generate analysis report +# ./scripts/run_e2e.sh --direct # Run router direct 
backtest only (no HTTP) +# ./scripts/run_e2e.sh --alignment # Run alignment guard tests only +# ./scripts/run_e2e.sh --full # Run all: API + direct + alignment +# ./scripts/run_e2e.sh --baseline # Compare with last baseline report +# +# Environment: +# E2E_PORT - Server port (default: 18765) +# E2E_API_KEY - API key for auth (default: ak_live_e2e_test_key_...) +# SKIP_SERVER - Set to "1" to skip server startup (use existing) +# ============================================================================= + +set -euo pipefail + +# ── Configuration ──────────────────────────────────────────────────────────── + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" +E2E_PORT="${E2E_PORT:-18765}" +E2E_API_KEY="${E2E_API_KEY:-ak_live_e2e_test_key_000000000000000000000000000000000000000000000000}" +REPORT_DIR="${PROJECT_ROOT}/test-results/e2e" +SKIP_SERVER="${SKIP_SERVER:-0}" + +cd "$PROJECT_ROOT" + +# ── Colors ─────────────────────────────────────────────────────────────────── + +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# ── Helper Functions ───────────────────────────────────────────────────────── + +info() { echo -e "${BLUE}[INFO]${NC} $*"; } +ok() { echo -e "${GREEN}[OK]${NC} $*"; } +warn() { echo -e "${YELLOW}[WARN]${NC} $*"; } +fail() { echo -e "${RED}[FAIL]${NC} $*"; } + +check_deps() { + local missing=0 + for cmd in python3; do + if ! command -v "$cmd" &>/dev/null; then + fail "Missing dependency: $cmd" + missing=1 + fi + done + if [ "$missing" -eq 1 ]; then + exit 1 + fi +} + +wait_for_server() { + local max_attempts=60 + local attempt=0 + info "Waiting for server on port $E2E_PORT..." 
+ while [ $attempt -lt $max_attempts ]; do + if curl -s "http://127.0.0.1:$E2E_PORT/api/v1/health" &>/dev/null; then + ok "Server is ready on port $E2E_PORT" + return 0 + fi + attempt=$((attempt + 1)) + sleep 0.5 + done + fail "Server failed to start within 30 seconds" + return 1 +} + +start_server() { + if [ "$SKIP_SERVER" = "1" ]; then + info "SKIP_SERVER=1, using existing server on port $E2E_PORT" + if curl -s "http://127.0.0.1:$E2E_PORT/api/v1/health" &>/dev/null; then + ok "Existing server is healthy" + return 0 + else + fail "Existing server is not responding" + return 1 + fi + fi + + info "Starting AgentKit E2E server on port $E2E_PORT..." + export AGENTKIT_E2E_MODE=1 + export AGENTKIT_WS_TIMEOUT=0 + export AGENTKIT_API_KEY="$E2E_API_KEY" + + # Start server in background + python3 -m agentkit.cli.main serve --host 127.0.0.1 --port "$E2E_PORT" & + SERVER_PID=$! + + if wait_for_server; then + return 0 + else + kill "$SERVER_PID" 2>/dev/null || true + return 1 + fi +} + +stop_server() { + if [ "$SKIP_SERVER" = "1" ]; then + info "SKIP_SERVER=1, not stopping server" + return 0 + fi + if [ -n "${SERVER_PID:-}" ]; then + info "Stopping E2E server (PID: $SERVER_PID)..." 
+ kill "$SERVER_PID" 2>/dev/null || true + wait "$SERVER_PID" 2>/dev/null || true + ok "Server stopped" + fi +} + +# ── Test Selection ─────────────────────────────────────────────────────────── + +PYTEST_ARGS=("--timeout=120" "-v" "--tb=short" "-s") +TEST_TARGET="tests/e2e/" +GENERATE_REPORT=0 +ANALYZE=0 +SKIP_SERVER_FLAG=0 +BASELINE_COMPARE=0 + +while [[ $# -gt 0 ]]; do + case $1 in + --basic) + PYTEST_ARGS+=("-m" "e2e_basic") + shift + ;; + --capability) + PYTEST_ARGS+=("-m" "e2e_capability") + shift + ;; + --cli) + TEST_TARGET="tests/e2e/test_basic_cli.py" + shift + ;; + --api) + TEST_TARGET="tests/e2e/test_basic_api.py" + shift + ;; + --ws) + TEST_TARGET="tests/e2e/test_basic_websocket.py" + shift + ;; + --routing) + TEST_TARGET="tests/e2e/test_capability_routing.py" + shift + ;; + --react) + TEST_TARGET="tests/e2e/test_capability_react.py" + shift + ;; + --team) + TEST_TARGET="tests/e2e/test_capability_team.py" + shift + ;; + --direct) + # Router direct backtest — no HTTP server needed + TEST_TARGET="tests/e2e/test_capability_router_direct.py" + SKIP_SERVER_FLAG=1 + shift + ;; + --alignment) + # Alignment guard tests — no HTTP server needed + TEST_TARGET="tests/e2e/test_capability_alignment.py" + SKIP_SERVER_FLAG=1 + shift + ;; + --full) + # Run all capability tests: API + direct + alignment + PYTEST_ARGS+=("-m" "e2e_capability") + shift + ;; + --baseline) + BASELINE_COMPARE=1 + shift + ;; + --report) + GENERATE_REPORT=1 + shift + ;; + --analyze) + ANALYZE=1 + PYTEST_ARGS+=("-m" "e2e_capability") + shift + ;; + --fast) + PYTEST_ARGS+=("-x" "--timeout=30") + shift + ;; + --help|-h) + echo "Usage: $0 [--basic|--capability|--cli|--api|--ws|--routing|--react|--team|--direct|--alignment|--full|--baseline|--report|--analyze|--fast]" + exit 0 + ;; + *) + PYTEST_ARGS+=("$1") + shift + ;; + esac +done + +if [ "$GENERATE_REPORT" -eq 1 ]; then + mkdir -p "$REPORT_DIR" + PYTEST_ARGS+=( + "--html=$REPORT_DIR/e2e_report.html" + "--self-contained-html" + 
"--junitxml=$REPORT_DIR/e2e_junit.xml" + ) +fi + +if [ "$ANALYZE" -eq 1 ]; then + info "Analysis mode: will generate capability report with recall/F1/overfitting analysis" +fi + +# Override SKIP_SERVER when --direct or --alignment is used (no HTTP needed) +if [ "$SKIP_SERVER_FLAG" -eq 1 ]; then + SKIP_SERVER=1 +fi + +# ── Main ───────────────────────────────────────────────────────────────────── + +info "Fischer AgentKit E2E Backtest Runner" +info "=====================================" +info "Project: $PROJECT_ROOT" +info "Port: $E2E_PORT" +info "Target: $TEST_TARGET" +info "" + +check_deps + +# Trap to ensure server cleanup +trap stop_server EXIT INT TERM + +if ! start_server; then + fail "Could not start E2E server" + exit 1 +fi + +info "" +info "Running E2E tests..." +info "====================" +info "" + +export AGENTKIT_SERVER_URL="http://127.0.0.1:$E2E_PORT" +export AGENTKIT_API_KEY="$E2E_API_KEY" + +EXIT_CODE=0 +python3 -m pytest "$TEST_TARGET" "${PYTEST_ARGS[@]}" || EXIT_CODE=$? + +echo "" +if [ $EXIT_CODE -eq 0 ]; then + ok "All E2E tests passed!" 
+else + fail "Some E2E tests failed (exit code: $EXIT_CODE)" +fi + +if [ "$GENERATE_REPORT" -eq 1 ]; then + info "Report generated at: $REPORT_DIR/e2e_report.html" +fi + +if [ "$ANALYZE" -eq 1 ]; then + CAPABILITY_REPORT="$PROJECT_ROOT/test-results/e2e/capability_report.txt" + if [ -f "$CAPABILITY_REPORT" ]; then + info "Capability analysis report:" + echo "" + cat "$CAPABILITY_REPORT" + else + warn "Capability report not found (may need capability tests to run first)" + fi +fi + +if [ "$BASELINE_COMPARE" -eq 1 ]; then + CURRENT_REPORT="$PROJECT_ROOT/test-results/e2e/capability_report.json" + BASELINE_REPORT="$PROJECT_ROOT/test-results/e2e/baseline_capability_report.json" + if [ -f "$CURRENT_REPORT" ] && [ -f "$BASELINE_REPORT" ]; then + info "Baseline comparison:" + python3 -c " +import json, sys + +def load_metrics(path): + with open(path) as f: + return json.load(f) + +cur = load_metrics('$CURRENT_REPORT') +base = load_metrics('$BASELINE_REPORT') + +metrics = [ + ('overall_skill_recall', '技能路由召回率'), + ('overall_skill_precision', '技能路由精确率'), + ('overall_skill_f1', '技能路由F1'), + ('overall_execution_mode_accuracy', '执行模式准确率'), + ('overall_task_success_rate', '任务成功率'), + ('overfitting_score', '过拟合分数'), +] + +print() +for key, label in metrics: + c = cur.get(key, 0) + b = base.get(key, 0) + delta = c - b + arrow = '↑' if delta > 0 else ('↓' if delta < 0 else '→') + print(f' {label}: {b:.2%} → {c:.2%} {arrow} {delta:+.2%}') +print() +" + elif [ -f "$CURRENT_REPORT" ]; then + info "No baseline report found. Saving current report as baseline." + cp "$CURRENT_REPORT" "$BASELINE_REPORT" + info "Baseline saved to: $BASELINE_REPORT" + else + warn "No current report found. Run with --analyze first." 
+ fi +fi + +exit $EXIT_CODE diff --git a/src/agentkit/chat/skill_routing.py b/src/agentkit/chat/skill_routing.py index 36483a3..4e2ec8b 100644 --- a/src/agentkit/chat/skill_routing.py +++ b/src/agentkit/chat/skill_routing.py @@ -33,9 +33,31 @@ class ExecutionMode(enum.Enum): DIRECT_CHAT = "direct_chat" # Zero-cost: direct LLM call, no ReAct loop REACT = "react" # Default agent ReAct loop with default tools SKILL_REACT = "skill_react" # Skill-matched ReAct with skill tools + prompt + REWOO = "rewoo" # Plan-without-observation mode + REFLEXION = "reflexion" # Reflection-driven mode + PLAN_EXEC = "plan_exec" # Plan-then-execute mode TEAM_COLLAB = "team_collab" # Expert Team collaborative mode +# Mapping from skill config execution_mode string to ExecutionMode enum +_SKILL_EXECUTION_MODE_MAP: dict[str, ExecutionMode] = { + "direct": ExecutionMode.DIRECT_CHAT, + "react": ExecutionMode.SKILL_REACT, + "rewoo": ExecutionMode.REWOO, + "reflexion": ExecutionMode.REFLEXION, + "plan_exec": ExecutionMode.PLAN_EXEC, + "custom": ExecutionMode.SKILL_REACT, + "llm_generate": ExecutionMode.SKILL_REACT, + "tool_call": ExecutionMode.SKILL_REACT, +} + + +def _resolve_execution_mode(skill_config: Any) -> ExecutionMode: + """Resolve ExecutionMode from skill config's execution_mode field.""" + mode_str = getattr(skill_config, "execution_mode", "react") or "react" + return _SKILL_EXECUTION_MODE_MAP.get(mode_str, ExecutionMode.SKILL_REACT) + + def validate_skill_name(name: str) -> str: """Validate and normalize a skill name. 
Raises ValueError on invalid input.""" normalized = name.strip().lower() @@ -265,7 +287,8 @@ async def resolve_skill_routing( else default_model ) result.agent_name = result.skill_name - result.execution_mode = ExecutionMode.SKILL_REACT + # Map skill.config.execution_mode to ExecutionMode enum + result.execution_mode = _resolve_execution_mode(result.skill_config) else: result.system_prompt = default_system_prompt result.tools = default_tools @@ -596,21 +619,10 @@ class HeuristicClassifier: content_lower = content.lower() score = 0.0 - # 0. 低复杂度信号检测(优先级最高) + # 0. 低复杂度信号检测(仅在无高复杂度信号时生效) low_hits_cn = sum(1 for h in self._LOW_COMPLEXITY_HINTS_CN if h in content_lower) - low_hits_en = sum( - 1 for h in self._LOW_COMPLEXITY_HINTS_EN if h in content_lower - ) - if low_hits_cn + low_hits_en > 0: - score = 0.05 # 问候/闲聊直接给极低分 - # 低复杂度信号下不再累加高复杂度词的分数 - # 但仍保留长度和多句的微调 - length = len(content) - if length > 200: - score += 0.05 - elif length > 100: - score += 0.03 - return max(0.0, min(1.0, score)) + low_hits_en = sum(1 for h in self._LOW_COMPLEXITY_HINTS_EN if h in content_lower) + has_low_signal = low_hits_cn + low_hits_en > 0 # 1. 
否定上下文检测 — 提取被否定的词 negated_words: set[str] = set() @@ -624,21 +636,27 @@ class HeuristicClassifier: for h in self._HIGH_COMPLEXITY_HINTS_CN if h in content_lower and h not in negated_words ) - medium_hits = sum( - 1 for m in self._MEDIUM_COMPLEXITY_HINTS_CN if m in content_lower - ) + medium_hits = sum(1 for m in self._MEDIUM_COMPLEXITY_HINTS_CN if m in content_lower) # 英文:词边界匹配 - high_en_matches = self._HIGH_EN_RE.findall(content) + self._HIGH_EXACT_RE.findall( - content - ) - high_hits += sum( - 1 for w in high_en_matches if w.lower() not in negated_words - ) + high_en_matches = self._HIGH_EN_RE.findall(content) + self._HIGH_EXACT_RE.findall(content) + high_hits += sum(1 for w in high_en_matches if w.lower() not in negated_words) medium_hits += len(self._MEDIUM_EN_RE.findall(content)) + len( self._MEDIUM_EXACT_RE.findall(content) ) + has_high_signal = high_hits > 0 or medium_hits > 0 + + # 低复杂度信号仅在无高/中复杂度信号时生效 + if has_low_signal and not has_high_signal: + score = 0.05 # 问候/闲聊直接给极低分 + length = len(content) + if length > 200: + score += 0.05 + elif length > 100: + score += 0.03 + return max(0.0, min(1.0, score)) + if high_hits >= 2: score = 0.80 elif high_hits == 1: diff --git a/src/agentkit/quality/gate.py b/src/agentkit/quality/gate.py index 25473fd..9af94a3 100644 --- a/src/agentkit/quality/gate.py +++ b/src/agentkit/quality/gate.py @@ -38,6 +38,7 @@ class QualityGate: self, output: dict[str, Any], skill: Skill, + skill_context: dict[str, Any] | None = None, ) -> QualityResult: """对产出执行多维度质量检查 @@ -46,6 +47,7 @@ class QualityGate: 2. 最低字数检查 3. JSON Schema 验证(如 skill.config.output_schema 存在) 4. 自定义验证器(如 skill.config.quality_gate.custom_validator 存在) + 5. 技能匹配验证(如 skill_context 含 intent_keywords) """ checks: list[QualityCheck] = [] qg = skill.config.quality_gate @@ -53,11 +55,13 @@ class QualityGate: # 1. 
必填字段检查 for field in qg.required_fields: present = field in output and output[field] is not None - checks.append(QualityCheck( - name=f"required_field:{field}", - passed=present, - message=f"Field '{field}' is missing" if not present else None, - )) + checks.append( + QualityCheck( + name=f"required_field:{field}", + passed=present, + message=f"Field '{field}' is missing" if not present else None, + ) + ) # 2. 最低字数检查 if qg.min_word_count > 0: @@ -67,15 +71,17 @@ class QualityGate: else: word_count = len(str(content).split()) passed = word_count >= qg.min_word_count - checks.append(QualityCheck( - name="min_word_count", - passed=passed, - message=( - f"Word count {word_count} < minimum {qg.min_word_count}" - if not passed - else None - ), - )) + checks.append( + QualityCheck( + name="min_word_count", + passed=passed, + message=( + f"Word count {word_count} < minimum {qg.min_word_count}" + if not passed + else None + ), + ) + ) # 3. JSON Schema 验证 if skill.config.output_schema: @@ -101,11 +107,34 @@ class QualityGate: checks.append(QualityCheck(name="custom", passed=bool(result))) except Exception as e: # 验证器导入/执行失败,跳过并记录警告 - checks.append(QualityCheck( - name="custom", - passed=True, - message=f"Validator skipped: {e}", - )) + checks.append( + QualityCheck( + name="custom", + passed=True, + message=f"Validator skipped: {e}", + ) + ) + + # 5. 
技能匹配验证(轻量级路由一致性检查) + skill_match_check = self._check_skill_match(output, skill_context) + if skill_match_check is not None: + checks.append(skill_match_check) + + # 警告升级逻辑:当 skill_match 警告存在且其他维度有失败时,升级为失败 + if ( + skill_match_check is not None + and skill_match_check.message + and "Warning" in skill_match_check.message + ): + other_failed = any(not c.passed for c in checks if c is not skill_match_check) + if other_failed: + # 升级:将 skill_match 的 passed 也设为 False + checks = [ + QualityCheck(name=c.name, passed=False, message=c.message) + if c is skill_match_check + else c + for c in checks + ] return QualityResult( passed=all(c.passed for c in checks), @@ -119,6 +148,42 @@ class QualityGate: "app.agent_framework.", ) + @staticmethod + def _check_skill_match( + output: dict[str, Any], + skill_context: dict[str, Any] | None, + ) -> QualityCheck | None: + """第五维度:技能匹配验证 + + 当 skill_context 含 intent_keywords 时,检查输出内容是否包含 + 至少一个关键词。不匹配时标记为警告(passed=True + message), + 当其他维度也有失败时升级为 passed=False。 + + Returns: + QualityCheck 或 None(当 skill_context 无效时跳过) + """ + if not skill_context: + return None + + intent_keywords: list[str] | None = skill_context.get("intent_keywords") + if not intent_keywords: + return None + + # 拼接输出中所有字符串值 + all_text = " ".join( + str(v) for v in output.values() if isinstance(v, (str, int, float, bool)) + ).lower() + + matched = any(kw.lower() in all_text for kw in intent_keywords) + if matched: + return QualityCheck(name="skill_match", passed=True) + + return QualityCheck( + name="skill_match", + passed=True, # 警告级别,不单独拦截 + message="Warning: output may not match routed skill", + ) + def _import_validator(self, dotted_path: str) -> Callable: """从点分路径导入自定义验证器函数 diff --git a/src/agentkit/router/intent.py b/src/agentkit/router/intent.py index 32a3821..ffa85a1 100644 --- a/src/agentkit/router/intent.py +++ b/src/agentkit/router/intent.py @@ -75,10 +75,11 @@ class IntentRouter: def _match_keywords( self, input_data: dict[str, Any], skills: list[Skill] ) 
-> RoutingResult | None: - """Level 1: 关键词匹配 + """Level 1: 多候选关键词评分匹配 从 input_data 中提取所有字符串值(包括嵌套),对每个 Skill 的 - intent.keywords 进行大小写不敏感匹配。 + intent.keywords 进行大小写不敏感匹配。收集所有匹配候选, + 按匹配关键词总长度(更长关键词权重更高)排序,返回最佳匹配。 """ text_values = self._extract_string_values(input_data) combined_text = " ".join(text_values).lower() @@ -86,17 +87,30 @@ class IntentRouter: if not combined_text: return None + # 收集所有匹配候选 + candidates: list[tuple[Skill, list[str], int]] = [] for skill in skills: keywords = skill.config.intent.keywords - for keyword in keywords: - if keyword.lower() in combined_text: - return RoutingResult( - matched_skill=skill.name, - method="keyword", - confidence=1.0, - ) + if not keywords: + continue + matched_kws = [kw for kw in keywords if kw.lower() in combined_text] + if matched_kws: + score = sum(len(kw) for kw in matched_kws) + candidates.append((skill, matched_kws, score)) - return None + if not candidates: + return None + + # 按得分降序排序,得分相同时按 skill 名称字母序稳定排序 + candidates.sort(key=lambda c: (-c[2], c[0].name)) + best_skill, best_kws, _best_score = candidates[0] + confidence = min(1.0, 0.5 + 0.1 * len(best_kws)) + + return RoutingResult( + matched_skill=best_skill.name, + method="keyword", + confidence=confidence, + ) async def _classify_with_llm( self, input_data: dict[str, Any], skills: list[Skill] @@ -107,9 +121,7 @@ class IntentRouter: 最佳匹配的 Skill。 """ if self._llm_gateway is None: - raise RuntimeError( - "Keyword matching failed and no LLM Gateway configured for fallback" - ) + raise RuntimeError("Keyword matching failed and no LLM Gateway configured for fallback") prompt = self._build_classification_prompt(input_data, skills) @@ -120,9 +132,7 @@ class IntentRouter: return self._parse_llm_response(response.content, skills) - def _build_classification_prompt( - self, input_data: dict[str, Any], skills: list[Skill] - ) -> str: + def _build_classification_prompt(self, input_data: dict[str, Any], skills: list[Skill]) -> str: """构建 LLM 分类 prompt""" 
skill_descriptions = [] for i, skill in enumerate(skills, 1): @@ -142,13 +152,11 @@ class IntentRouter: "\n" f"User input: {input_data}\n" "\n" - 'Respond in JSON format:\n' + "Respond in JSON format:\n" '{"skill": "skill_name", "confidence": 0.9}' ) - def _parse_llm_response( - self, content: str, skills: list[Skill] - ) -> RoutingResult: + def _parse_llm_response(self, content: str, skills: list[Skill]) -> RoutingResult: """解析 LLM 响应,提取 skill name 和 confidence""" valid_names = {s.name for s in skills} @@ -175,9 +183,7 @@ class IntentRouter: ) @staticmethod - def _extract_skill_name_from_text( - text: str, valid_names: set[str] - ) -> str: + def _extract_skill_name_from_text(text: str, valid_names: set[str]) -> str: """从文本中尝试提取有效的 Skill 名称""" text_lower = text.lower() for name in valid_names: diff --git a/tests/e2e/__init__.py b/tests/e2e/__init__.py new file mode 100644 index 0000000..e74f2b3 --- /dev/null +++ b/tests/e2e/__init__.py @@ -0,0 +1,11 @@ +"""E2E backtest suite for Fischer AgentKit. + +Split into two dimensions: + - Basic Functions: verify all features work correctly (CLI, API, WebSocket, lifecycle) + - Agent Capabilities: verify intelligence level (routing, reasoning, collaboration) + +Uses subprocess to simulate real CLI operations (OpenCLI pattern), +httpx for API calls, and websockets for WS chat. +""" + +from tests.e2e.conftest import * # noqa: F401,F403 diff --git a/tests/e2e/benchmark_dataset.py b/tests/e2e/benchmark_dataset.py new file mode 100644 index 0000000..b1850e2 --- /dev/null +++ b/tests/e2e/benchmark_dataset.py @@ -0,0 +1,830 @@ +"""Agent Capability Benchmark — Ground Truth Dataset (v2). + +Aligned with actual skills in configs/skills/*.yaml. +Contains both manually curated edge cases and auto-generated cases. 
+ +Categories: + - routing: intent routing correctness + - execution: execution mode selection accuracy + - team: expert team collaboration + - consistency: deterministic output consistency + - semantic_router: semantic similarity matching + - alignment: constraint compliance and cascade detection +""" + +from pydantic import BaseModel, ConfigDict + + +class BenchmarkCase(BaseModel): + """A single benchmark test case with ground truth label.""" + + model_config = ConfigDict(frozen=True) + + id: str + input: str + expected_skill: str | None = None + expected_execution_mode: str = "direct" + expected_complexity: str = "low" + category: str + subcategory: str + paraphrases: list[str] = [] + tags: list[str] = [] + + +# ═══════════════════════════════════════════════════════════════════════════ +# Routing — Keyword Match (aligned with actual skills) +# ═══════════════════════════════════════════════════════════════════════════ + +ROUTING_KEYWORD_BENCHMARKS: list[BenchmarkCase] = [ + # direct_agent + BenchmarkCase( + id="route-kw-direct-001", + input="翻译这段话", + expected_skill="direct_agent", + expected_execution_mode="direct", + expected_complexity="low", + category="routing", + subcategory="keyword_match", + paraphrases=["帮我翻译一下", "请翻译这段内容", "Translate this text"], + tags=["翻译", "translate"], + ), + BenchmarkCase( + id="route-kw-direct-002", + input="帮我总结一下", + expected_skill="direct_agent", + expected_execution_mode="direct", + expected_complexity="low", + category="routing", + subcategory="keyword_match", + paraphrases=["请总结", "给我一个摘要", "Summarize this"], + tags=["摘要", "summarize"], + ), + BenchmarkCase( + id="route-kw-direct-003", + input="什么是RAG?", + expected_skill="direct_agent", + expected_execution_mode="direct", + expected_complexity="low", + category="routing", + subcategory="keyword_match", + paraphrases=["RAG是什么", "解释一下RAG", "What is RAG?"], + tags=["什么是"], + ), + # react_agent + BenchmarkCase( + id="route-kw-react-001", + input="搜索一下AI Agent市场数据", + 
expected_skill="react_agent", + expected_execution_mode="react", + expected_complexity="high", + category="routing", + subcategory="keyword_match", + paraphrases=[ + "帮我搜索AI Agent市场信息", + "查找AI Agent的市场数据", + "Search AI Agent market data", + ], + tags=["搜索", "search"], + ), + BenchmarkCase( + id="route-kw-react-002", + input="帮我分析这个数据", + expected_skill="react_agent", + expected_execution_mode="react", + expected_complexity="high", + category="routing", + subcategory="keyword_match", + paraphrases=["分析一下这些数据", "请对数据做分析", "Analyze this data"], + tags=["分析", "analyze"], + ), + BenchmarkCase( + id="route-kw-react-003", + input="实时监控竞品动态", + expected_skill="react_agent", + expected_execution_mode="react", + expected_complexity="high", + category="routing", + subcategory="keyword_match", + paraphrases=["监控竞争对手的动态", "实时追踪竞品变化", "Monitor competitor activities"], + tags=["实时", "监控"], + ), + # rewoo_agent + BenchmarkCase( + id="route-kw-rewoo-001", + input="采集A、B、C三个竞品的功能数据", + expected_skill="rewoo_agent", + expected_execution_mode="rewoo", + expected_complexity="high", + category="routing", + subcategory="keyword_match", + paraphrases=[ + "批量采集竞品数据", + "并行获取多个竞品信息", + "Fetch data from multiple competitors", + ], + tags=["采集", "批量", "fetch"], + ), + BenchmarkCase( + id="route-kw-rewoo-002", + input="并行搜索多个关键词", + expected_skill="rewoo_agent", + expected_execution_mode="rewoo", + expected_complexity="high", + category="routing", + subcategory="keyword_match", + paraphrases=["同时搜索多个关键词", "批量搜索", "Search multiple keywords in parallel"], + tags=["并行", "批量"], + ), + # reflexion_agent + BenchmarkCase( + id="route-kw-reflex-001", + input="审查这段代码的合规性", + expected_skill="reflexion_agent", + expected_execution_mode="reflexion", + expected_complexity="high", + category="routing", + subcategory="keyword_match", + paraphrases=["检查代码是否合规", "审查代码合规问题", "Review code compliance"], + tags=["审查", "合规", "review"], + ), + BenchmarkCase( + id="route-kw-reflex-002", + input="生成一个高精度的数据分析脚本", + 
expected_skill="reflexion_agent", + expected_execution_mode="reflexion", + expected_complexity="high", + category="routing", + subcategory="keyword_match", + paraphrases=[ + "写一个精确的数据分析脚本", + "生成高精度分析代码", + "Generate a precise analysis script", + ], + tags=["代码生成", "精确", "code"], + ), + # plan_exec_agent + BenchmarkCase( + id="route-kw-planexec-001", + input="生成一份市场分析报告", + expected_skill="plan_exec_agent", + expected_execution_mode="plan_exec", + expected_complexity="high", + category="routing", + subcategory="keyword_match", + paraphrases=["做一份市场分析报告", "写个市场分析报告", "Generate a market analysis report"], + tags=["报告", "分析报告"], + ), + BenchmarkCase( + id="route-kw-planexec-002", + input="规划产品优化方案", + expected_skill="plan_exec_agent", + expected_execution_mode="plan_exec", + expected_complexity="high", + category="routing", + subcategory="keyword_match", + paraphrases=["制定产品优化计划", "帮我规划产品优化", "Plan product optimization"], + tags=["规划", "plan"], + ), + # code_reviewer + BenchmarkCase( + id="route-kw-coderev-001", + input="Review this code for quality", + expected_skill="code_reviewer", + expected_execution_mode="direct", + expected_complexity="low", + category="routing", + subcategory="keyword_match", + paraphrases=["审查这段代码的质量", "代码审查", "Check code quality"], + tags=["review", "代码审查"], + ), + # geo_optimizer + BenchmarkCase( + id="route-kw-geo-001", + input="帮我优化这篇文章的SEO", + expected_skill="geo_optimizer", + expected_execution_mode="llm_generate", + expected_complexity="low", + category="routing", + subcategory="keyword_match", + paraphrases=["SEO优化一下", "提升文章搜索排名", "Optimize this article for SEO"], + tags=["SEO优化", "optimize"], + ), + # deai_agent + BenchmarkCase( + id="route-kw-deai-001", + input="帮我把这篇文章去AI化", + expected_skill="deai_agent", + expected_execution_mode="llm_generate", + expected_complexity="low", + category="routing", + subcategory="keyword_match", + paraphrases=["让这段文字更自然", "改写得像人写的", "Make this text more natural"], + tags=["去AI化", "人性化"], + ), + # 
content_generator + BenchmarkCase( + id="route-kw-content-001", + input="帮我写一篇关于AI的文章", + expected_skill="content_generator", + expected_execution_mode="llm_generate", + expected_complexity="low", + category="routing", + subcategory="keyword_match", + paraphrases=["写一篇AI相关文章", "生成关于AI的内容", "Write an article about AI"], + tags=["写文章", "generate"], + ), + # citation_detector + BenchmarkCase( + id="route-kw-citation-001", + input="检测我们的品牌在AI平台的引用情况", + expected_skill="citation_detector", + expected_execution_mode="custom", + expected_complexity="medium", + category="routing", + subcategory="keyword_match", + paraphrases=[ + "分析品牌引用率", + "哪些AI平台引用了我们", + "Check brand citation on AI platforms", + ], + tags=["引用检测", "citation"], + ), + # trend_agent + BenchmarkCase( + id="route-kw-trend-001", + input="分析品牌趋势", + expected_skill="trend_agent", + expected_execution_mode="tool_call", + expected_complexity="medium", + category="routing", + subcategory="keyword_match", + paraphrases=["最近的热点话题是什么", "趋势洞察", "Analyze brand trends"], + tags=["趋势", "trend"], + ), + # competitor_analyzer + BenchmarkCase( + id="route-kw-competitor-001", + input="分析我的竞品策略", + expected_skill="competitor_analyzer", + expected_execution_mode="tool_call", + expected_complexity="medium", + category="routing", + subcategory="keyword_match", + paraphrases=["对比我和竞品的差距", "竞品分析", "Analyze competitor strategies"], + tags=["竞品", "competitor"], + ), + # schema_advisor + BenchmarkCase( + id="route-kw-schema-001", + input="帮我优化Schema", + expected_skill="schema_advisor", + expected_execution_mode="custom", + expected_complexity="medium", + category="routing", + subcategory="keyword_match", + paraphrases=["生成JSON-LD结构化数据", "Schema有什么可以改进的", "Optimize my Schema"], + tags=["Schema", "schema优化"], + ), + # monitor + BenchmarkCase( + id="route-kw-monitor-001", + input="监测品牌引用变化", + expected_skill="monitor", + expected_execution_mode="custom", + expected_complexity="medium", + category="routing", + 
subcategory="keyword_match", + paraphrases=["追踪效果", "品牌排名变化", "Monitor brand citation changes"], + tags=["监测", "monitor"], + ), + # goal_driven_agent + BenchmarkCase( + id="route-kw-goal-001", + input="分析竞品SEO策略并生成优化方案", + expected_skill="goal_driven_agent", + expected_execution_mode="tool_call", + expected_complexity="medium", + category="routing", + subcategory="keyword_match", + paraphrases=[ + "调研技术方案并生成对比报告", + "制定市场推广计划", + "Analyze SEO and generate plan", + ], + tags=["分析", "优化方案"], + ), +] + + +# ═══════════════════════════════════════════════════════════════════════════ +# Routing — Edge Cases (manually curated) +# ═══════════════════════════════════════════════════════════════════════════ + +ROUTING_EDGE_BENCHMARKS: list[BenchmarkCase] = [ + # Greeting (should NOT route to any skill) + BenchmarkCase( + id="route-edge-greet-001", + input="你好", + expected_skill=None, + expected_execution_mode="direct", + expected_complexity="low", + category="routing", + subcategory="greeting", + paraphrases=["Hello", "Hi there", "早上好"], + tags=["greeting"], + ), + BenchmarkCase( + id="route-edge-greet-002", + input="Good morning!", + expected_skill=None, + expected_execution_mode="direct", + expected_complexity="low", + category="routing", + subcategory="greeting", + paraphrases=["早上好!", "你好呀"], + tags=["greeting"], + ), + # Identity (should NOT route to any skill) + BenchmarkCase( + id="route-edge-identity-001", + input="你是谁?", + expected_skill=None, + expected_execution_mode="direct", + expected_complexity="low", + category="routing", + subcategory="identity", + paraphrases=["What is your name?", "介绍一下你自己", "Tell me about yourself"], + tags=["identity"], + ), + # Explicit prefix + BenchmarkCase( + id="route-edge-explicit-001", + input="@skill:react_agent 搜索最新的AI新闻", + expected_skill="react_agent", + expected_execution_mode="react", + expected_complexity="high", + category="routing", + subcategory="explicit_prefix", + paraphrases=["@skill:react_agent 查找AI最新动态"], + 
tags=["explicit", "react"], + ), + # Fallback (no matching skill) + BenchmarkCase( + id="route-edge-fallback-001", + input="告诉我一个笑话", + expected_skill=None, + expected_execution_mode="direct", + expected_complexity="low", + category="routing", + subcategory="fallback", + paraphrases=["讲个笑话", "Tell me a joke", "说个搞笑的"], + tags=["fallback"], + ), + BenchmarkCase( + id="route-edge-fallback-002", + input="What is quantum physics?", + expected_skill=None, + expected_execution_mode="direct", + expected_complexity="low", + category="routing", + subcategory="fallback", + paraphrases=["量子物理是什么", "Explain quantum mechanics"], + tags=["fallback"], + ), + # Disambiguation (multiple skills could match) + BenchmarkCase( + id="route-edge-disambig-001", + input="审查代码并优化SEO", + expected_skill="code_reviewer", + expected_execution_mode="direct", + expected_complexity="low", + category="routing", + subcategory="disambiguation", + paraphrases=["Review code and optimize SEO", "代码审查加SEO优化"], + tags=["disambiguation", "review", "seo"], + ), +] + + +# ═══════════════════════════════════════════════════════════════════════════ +# Execution Mode Benchmarks +# ═══════════════════════════════════════════════════════════════════════════ + +EXECUTION_BENCHMARKS: list[BenchmarkCase] = [ + BenchmarkCase( + id="exec-direct-001", + input="翻译这段话成英文", + expected_skill="direct_agent", + expected_execution_mode="direct", + expected_complexity="low", + category="execution", + subcategory="direct_mode", + paraphrases=["Translate this to English", "把这段翻成英语"], + tags=["direct", "simple"], + ), + BenchmarkCase( + id="exec-direct-002", + input="什么是AgentKit?", + expected_skill="direct_agent", + expected_execution_mode="direct", + expected_complexity="low", + category="execution", + subcategory="direct_mode", + paraphrases=["AgentKit是什么", "Explain AgentKit"], + tags=["direct", "qa"], + ), + BenchmarkCase( + id="exec-react-001", + input="搜索并分析AI行业最新趋势", + expected_skill="react_agent", + 
expected_execution_mode="react", + expected_complexity="high", + category="execution", + subcategory="react_mode", + paraphrases=["Search and analyze AI trends", "调研AI行业趋势"], + tags=["react", "multi_step"], + ), + BenchmarkCase( + id="exec-react-002", + input="实时监控竞品动态并生成报告", + expected_skill="react_agent", + expected_execution_mode="react", + expected_complexity="high", + category="execution", + subcategory="react_mode", + paraphrases=["Monitor competitors and report", "追踪竞品并输出报告"], + tags=["react", "monitoring"], + ), + BenchmarkCase( + id="exec-rewoo-001", + input="批量采集多个竞品的功能数据", + expected_skill="rewoo_agent", + expected_execution_mode="rewoo", + expected_complexity="high", + category="execution", + subcategory="rewoo_mode", + paraphrases=["并行获取竞品数据", "Fetch competitor data in parallel"], + tags=["rewoo", "parallel"], + ), + BenchmarkCase( + id="exec-reflexion-001", + input="审查代码合规性并确保高精度", + expected_skill="reflexion_agent", + expected_execution_mode="reflexion", + expected_complexity="high", + category="execution", + subcategory="reflexion_mode", + paraphrases=["高精度代码审查", "Precise code compliance review"], + tags=["reflexion", "precision"], + ), + BenchmarkCase( + id="exec-planexec-001", + input="生成一份完整的市场调研报告", + expected_skill="plan_exec_agent", + expected_execution_mode="plan_exec", + expected_complexity="high", + category="execution", + subcategory="plan_exec_mode", + paraphrases=["做一份市场调研报告", "Generate a market research report"], + tags=["plan_exec", "report"], + ), + BenchmarkCase( + id="exec-quality-001", + input="生成内容并确保质量达标", + expected_skill="content_generator", + expected_execution_mode="llm_generate", + expected_complexity="low", + category="execution", + subcategory="quality_gate", + paraphrases=["生成高质量内容", "Generate quality content"], + tags=["quality", "content"], + ), +] + + +# ═══════════════════════════════════════════════════════════════════════════ +# Team Collaboration Benchmarks +# 
═══════════════════════════════════════════════════════════════════════════ + +TEAM_BENCHMARKS: list[BenchmarkCase] = [ + BenchmarkCase( + id="team-explicit-001", + input="@team:react_agent,plan_exec_agent 协作完成深度分析并生成报告", + expected_execution_mode="react", + expected_complexity="high", + category="team", + subcategory="explicit_team", + paraphrases=[ + "需要react_agent和plan_exec_agent协作", + "组建团队:搜索分析+报告生成", + ], + tags=["team", "explicit"], + ), + BenchmarkCase( + id="team-explicit-002", + input="@team:competitor_analyzer,trend_agent 分析竞品并追踪趋势", + expected_execution_mode="react", + expected_complexity="high", + category="team", + subcategory="explicit_team", + paraphrases=["竞品分析+趋势追踪团队", "Team for competitor and trend analysis"], + tags=["team", "explicit"], + ), + BenchmarkCase( + id="team-complexity-001", + input="深度分析竞品策略、追踪品牌趋势并生成优化方案", + expected_execution_mode="react", + expected_complexity="high", + category="team", + subcategory="complexity_trigger", + paraphrases=[ + "全面竞品分析和优化方案", + "Comprehensive competitor analysis with optimization", + ], + tags=["team", "complexity"], + ), + BenchmarkCase( + id="team-fallback-001", + input="复杂任务但无匹配专家", + expected_execution_mode="react", + expected_complexity="high", + category="team", + subcategory="fallback", + paraphrases=["需要团队但找不到合适专家", "Complex task without matching experts"], + tags=["team", "fallback"], + ), + BenchmarkCase( + id="team-name-valid-001", + input="@team:react_agent,plan_exec_agent", + expected_execution_mode="react", + expected_complexity="high", + category="team", + subcategory="name_validation", + tags=["team", "validation"], + ), + BenchmarkCase( + id="team-name-invalid-001", + input="@team:invalid expert name", + expected_execution_mode="direct", + expected_complexity="low", + category="team", + subcategory="name_validation", + tags=["team", "validation", "invalid"], + ), +] + + +# ═══════════════════════════════════════════════════════════════════════════ +# Consistency Benchmarks +# 
═══════════════════════════════════════════════════════════════════════════ + +CONSISTENCY_BENCHMARKS: list[BenchmarkCase] = [ + BenchmarkCase( + id="consist-direct-001", + input="翻译'hello world'成中文", + expected_skill="direct_agent", + expected_execution_mode="direct", + expected_complexity="low", + category="consistency", + subcategory="deterministic", + tags=["consistency", "translation"], + ), + BenchmarkCase( + id="consist-direct-002", + input="什么是RAG?", + expected_skill="direct_agent", + expected_execution_mode="direct", + expected_complexity="low", + category="consistency", + subcategory="deterministic", + tags=["consistency", "qa"], + ), + BenchmarkCase( + id="consist-react-001", + input="搜索AI Agent市场数据", + expected_skill="react_agent", + expected_execution_mode="react", + expected_complexity="high", + category="consistency", + subcategory="deterministic", + tags=["consistency", "search"], + ), + BenchmarkCase( + id="consist-geo-001", + input="帮我优化这篇文章的SEO", + expected_skill="geo_optimizer", + expected_execution_mode="llm_generate", + expected_complexity="low", + category="consistency", + subcategory="deterministic", + tags=["consistency", "seo"], + ), + BenchmarkCase( + id="consist-deai-001", + input="帮我把这篇文章去AI化", + expected_skill="deai_agent", + expected_execution_mode="llm_generate", + expected_complexity="low", + category="consistency", + subcategory="deterministic", + tags=["consistency", "deai"], + ), +] + + +# ═══════════════════════════════════════════════════════════════════════════ +# Semantic Router Benchmarks +# ═══════════════════════════════════════════════════════════════════════════ + +SEMANTIC_ROUTER_BENCHMARKS: list[BenchmarkCase] = [ + BenchmarkCase( + id="semantic-direct-001", + input="简单生成任务,无需工具调用", + expected_skill="direct_agent", + expected_execution_mode="direct", + expected_complexity="low", + category="semantic_router", + subcategory="description_match", + paraphrases=["只需要一次生成的简单任务", "Single LLM call task"], + tags=["semantic", 
"direct"], + ), + BenchmarkCase( + id="semantic-react-001", + input="需要动态适应、逐步推理和工具调用", + expected_skill="react_agent", + expected_execution_mode="react", + expected_complexity="high", + category="semantic_router", + subcategory="description_match", + paraphrases=["需要多步推理和工具", "Multi-step reasoning with tools"], + tags=["semantic", "react"], + ), + BenchmarkCase( + id="semantic-rewoo-001", + input="多源数据并行采集、无依赖工具调用批量执行", + expected_skill="rewoo_agent", + expected_execution_mode="rewoo", + expected_complexity="high", + category="semantic_router", + subcategory="description_match", + paraphrases=["并行批量获取数据", "Parallel data collection"], + tags=["semantic", "rewoo"], + ), + BenchmarkCase( + id="semantic-reflex-001", + input="需要高精度和自我验证的任务", + expected_skill="reflexion_agent", + expected_execution_mode="reflexion", + expected_complexity="high", + category="semantic_router", + subcategory="description_match", + paraphrases=["需要自我检查的高精度任务", "High-precision self-verification task"], + tags=["semantic", "reflexion"], + ), + BenchmarkCase( + id="semantic-planexec-001", + input="结构化多步骤任务,需要可审查的规划和执行", + expected_skill="plan_exec_agent", + expected_execution_mode="plan_exec", + expected_complexity="high", + category="semantic_router", + subcategory="description_match", + paraphrases=["需要先规划再执行的任务", "Structured planning and execution"], + tags=["semantic", "plan_exec"], + ), + BenchmarkCase( + id="semantic-geo-001", + input="对文章进行GEO/SEO优化,提升在AI搜索引擎中的可见性", + expected_skill="geo_optimizer", + expected_execution_mode="llm_generate", + expected_complexity="low", + category="semantic_router", + subcategory="description_match", + paraphrases=["提升内容搜索排名", "Improve content visibility in AI search"], + tags=["semantic", "geo"], + ), + BenchmarkCase( + id="semantic-citation-001", + input="检测品牌在各AI平台回答中的引用情况", + expected_skill="citation_detector", + expected_execution_mode="custom", + expected_complexity="medium", + category="semantic_router", + subcategory="description_match", + 
paraphrases=["分析品牌被AI引用的情况", "Check brand citation across AI platforms"], + tags=["semantic", "citation"], + ), + BenchmarkCase( + id="semantic-competitor-001", + input="分析竞品策略、对比品牌差距或发现竞争机会", + expected_skill="competitor_analyzer", + expected_execution_mode="tool_call", + expected_complexity="medium", + category="semantic_router", + subcategory="description_match", + paraphrases=["竞品对比和差距分析", "Competitive gap analysis"], + tags=["semantic", "competitor"], + ), +] + + +# ═══════════════════════════════════════════════════════════════════════════ +# Alignment Guard Benchmarks +# ═══════════════════════════════════════════════════════════════════════════ + +ALIGNMENT_BENCHMARKS: list[BenchmarkCase] = [ + BenchmarkCase( + id="align-negative-001", + input="写一篇产品介绍,不要提及价格", + expected_skill="content_generator", + expected_execution_mode="llm_generate", + expected_complexity="low", + category="alignment", + subcategory="negative_constraint", + tags=["alignment", "negative_constraint"], + ), + BenchmarkCase( + id="align-positive-001", + input="生成报告,必须包含摘要部分", + expected_skill="plan_exec_agent", + expected_execution_mode="plan_exec", + expected_complexity="high", + category="alignment", + subcategory="positive_constraint", + tags=["alignment", "positive_constraint"], + ), + BenchmarkCase( + id="align-cascade-001", + input="反复搜索相同关键词", + expected_skill="react_agent", + expected_execution_mode="react", + expected_complexity="high", + category="alignment", + subcategory="cascade_detection", + tags=["alignment", "cascade"], + ), + BenchmarkCase( + id="align-no-constraint-001", + input="帮我写一篇文章", + expected_skill="content_generator", + expected_execution_mode="llm_generate", + expected_complexity="low", + category="alignment", + subcategory="no_constraint", + tags=["alignment", "baseline"], + ), + BenchmarkCase( + id="align-combined-001", + input="生成竞品分析报告,必须包含对比表格,不要提及内部数据", + expected_skill="competitor_analyzer", + expected_execution_mode="tool_call", + 
expected_complexity="medium", + category="alignment", + subcategory="combined_constraint", + tags=["alignment", "combined"], + ), +] + + +# ═══════════════════════════════════════════════════════════════════════════ +# All benchmarks combined +# ═══════════════════════════════════════════════════════════════════════════ + +ALL_BENCHMARKS: list[BenchmarkCase] = ( + ROUTING_KEYWORD_BENCHMARKS + + ROUTING_EDGE_BENCHMARKS + + EXECUTION_BENCHMARKS + + TEAM_BENCHMARKS + + CONSISTENCY_BENCHMARKS + + SEMANTIC_ROUTER_BENCHMARKS + + ALIGNMENT_BENCHMARKS +) + + +def get_benchmarks_by_category(category: str) -> list[BenchmarkCase]: + """Filter benchmarks by category.""" + return [b for b in ALL_BENCHMARKS if b.category == category] + + +def get_benchmarks_by_subcategory(subcategory: str) -> list[BenchmarkCase]: + """Filter benchmarks by subcategory.""" + return [b for b in ALL_BENCHMARKS if b.subcategory == subcategory] + + +def get_benchmarks_with_paraphrases() -> list[BenchmarkCase]: + """Get only benchmarks that have paraphrases (for overfitting detection).""" + return [b for b in ALL_BENCHMARKS if b.paraphrases] + + +def get_skill_names_needed() -> set[str]: + """Get all skill names referenced in benchmarks (for pre-registration).""" + return {b.expected_skill for b in ALL_BENCHMARKS if b.expected_skill is not None} + + +def get_benchmark_stats() -> dict[str, int]: + """Get benchmark count by category.""" + stats: dict[str, int] = {} + for b in ALL_BENCHMARKS: + stats[b.category] = stats.get(b.category, 0) + 1 + stats["total"] = len(ALL_BENCHMARKS) + return stats diff --git a/tests/e2e/benchmark_generator.py b/tests/e2e/benchmark_generator.py new file mode 100644 index 0000000..a85ca04 --- /dev/null +++ b/tests/e2e/benchmark_generator.py @@ -0,0 +1,339 @@ +"""Benchmark Generator — Auto-generate benchmark cases from skill configs. 
class BenchmarkGenerator:
    """Generate benchmark cases from skill config YAML files.

    Skill configs (``*.yaml``) are read lazily from ``configs_dir`` on the
    first call to :meth:`load_skills` and cached on the instance. Each
    ``generate_*`` method turns the loaded configs into ``BenchmarkCase``
    objects for one benchmark category.
    """

    def __init__(self, configs_dir: str | None = None) -> None:
        """Create a generator.

        Args:
            configs_dir: Directory holding the skill YAML files. When omitted,
                ``<project_root>/configs/skills`` is located relative to this
                file.
        """
        if configs_dir is None:
            configs_dir = str(self._default_configs_dir())
        self.configs_dir = configs_dir
        self._skills: list[SkillConfig] = []
        self._loaded = False

    @staticmethod
    def _default_configs_dir() -> Path:
        """Locate ``configs/skills`` by walking up from this file.

        The nearest ancestor that actually contains ``configs/skills`` wins.
        If none exists, fall back to the directory two levels above this
        file's folder (``tests/e2e/<file>`` -> project root) rather than
        overshooting past the repository root.
        """
        ancestors = Path(__file__).resolve().parents
        for ancestor in ancestors:
            candidate = ancestor / "configs" / "skills"
            if candidate.is_dir():
                return candidate
        # parents[0] = tests/e2e, parents[1] = tests, parents[2] = project root.
        root = ancestors[min(2, len(ancestors) - 1)]
        return root / "configs" / "skills"

    @staticmethod
    def _contains_cjk(text: str) -> bool:
        """Return True if *text* has a character in U+4E00..U+9FFF."""
        return any("\u4e00" <= ch <= "\u9fff" for ch in text)

    def load_skills(self) -> list[SkillConfig]:
        """Load all skill configs from YAML files.

        Returns:
            The cached list of parsed skills. Empty when ``configs_dir`` does
            not exist (in which case nothing is cached and a later call
            retries).

        Files that cannot be parsed or do not validate as ``SkillConfig`` are
        skipped with a logged warning instead of being dropped silently.
        """
        if self._loaded:
            return self._skills

        skills_dir = Path(self.configs_dir)
        if not skills_dir.exists():
            return self._skills

        import logging  # local import: the module does not import logging

        logger = logging.getLogger(__name__)

        # Build into a local list so a failure midway never leaves a
        # half-filled cache that a retry would then duplicate.
        loaded: list[SkillConfig] = []
        for yaml_file in sorted(skills_dir.glob("*.yaml")):
            try:
                with open(yaml_file, encoding="utf-8") as f:
                    data = yaml.safe_load(f)
            except yaml.YAMLError as exc:
                logger.warning("Skipping unparseable skill config %s: %s", yaml_file, exc)
                continue

            if not data or not isinstance(data, dict):
                continue

            try:
                loaded.append(SkillConfig(**data))
            except (TypeError, ValueError) as exc:
                # pydantic validation errors derive from ValueError; TypeError
                # covers mappings that cannot be splatted as keyword arguments.
                logger.warning("Skipping invalid skill config %s: %s", yaml_file, exc)

        self._skills = loaded
        self._loaded = True
        return self._skills

    def _get_effective_execution_mode(self, skill: SkillConfig) -> str:
        """Get the effective execution mode for a skill.

        An explicit non-``"direct"`` ``execution_mode`` wins; otherwise the
        skill's ``task_mode`` is used, falling back to ``"direct"``.
        """
        if skill.execution_mode and skill.execution_mode != "direct":
            return skill.execution_mode
        # Map task_mode to execution mode
        return skill.task_mode if skill.task_mode else "direct"

    def _generate_paraphrases(self, example: str, keywords: list[str]) -> list[str]:
        """Generate up to three prefix-based paraphrases for an example query.

        Args:
            example: The original query text.
            keywords: The skill's intent keywords; the first one may be used
                to build a keyword-led variant for Chinese queries.

        Returns:
            A list of at most three paraphrases; empty for an empty example.
        """
        if not example:
            # Nothing to rephrase (and example[0] below would raise).
            return []

        paraphrases: list[str] = []

        if self._contains_cjk(example):
            # Chinese paraphrases: add polite / first-person prefixes.
            if not example.startswith(("请", "帮")):
                paraphrases.append(f"请{example}")
            if not example.startswith("我"):
                paraphrases.append(f"我需要{example}")
            # Keyword-led variant, only when it adds new wording.
            if keywords and keywords[0] not in example:
                paraphrases.append(f"关于{keywords[0]},{example}")
        else:
            # English paraphrases: lower-case the first letter after the prefix.
            lower = example.lower()
            body = f"{example[0].lower()}{example[1:]}"
            if not lower.startswith(("please", "can you")):
                paraphrases.append(f"Please {body}")
            if not lower.startswith("i need"):
                paraphrases.append(f"I need to {body}")

        return paraphrases[:3]  # Max 3 paraphrases per example

    def generate_routing_benchmarks(self) -> list[BenchmarkCase]:
        """Generate routing benchmark cases from all skills.

        One case is emitted per intent example (with paraphrases) and one per
        intent keyword (wrapped in a short "help me ..." query).
        """
        cases: list[BenchmarkCase] = []
        case_counter = 0

        for skill in self.load_skills():
            exec_mode = self._get_effective_execution_mode(skill)
            complexity = EXECUTION_MODE_TO_COMPLEXITY.get(exec_mode, "low")
            keywords = skill.intent.keywords

            # Generate from intent.examples
            for example in skill.intent.examples:
                case_counter += 1
                cases.append(
                    BenchmarkCase(
                        id=f"route-auto-{case_counter:03d}",
                        input=example,
                        expected_skill=skill.name,
                        expected_execution_mode=exec_mode,
                        expected_complexity=complexity,
                        category="routing",
                        subcategory="keyword_match",
                        paraphrases=self._generate_paraphrases(example, keywords),
                        tags=keywords[:3],
                    )
                )

            # Generate from intent.keywords (one case per keyword)
            for keyword in keywords:
                case_counter += 1
                query = f"帮我{keyword}" if self._contains_cjk(keyword) else f"Help me {keyword}"
                cases.append(
                    BenchmarkCase(
                        id=f"route-kw-auto-{case_counter:03d}",
                        input=query,
                        expected_skill=skill.name,
                        expected_execution_mode=exec_mode,
                        expected_complexity=complexity,
                        category="routing",
                        subcategory="keyword_match",
                        tags=[keyword],
                    )
                )

        return cases

    def generate_execution_benchmarks(self) -> list[BenchmarkCase]:
        """Generate execution mode benchmark cases.

        Skills are grouped by effective execution mode; at most two skills per
        mode contribute a case, and only when they define an intent example.
        """
        cases: list[BenchmarkCase] = []
        case_counter = 0

        # Group skills by execution mode
        mode_groups: dict[str, list[SkillConfig]] = {}
        for skill in self.load_skills():
            mode_groups.setdefault(self._get_effective_execution_mode(skill), []).append(skill)

        for mode, group in mode_groups.items():
            complexity = EXECUTION_MODE_TO_COMPLEXITY.get(mode, "low")
            for skill in group[:2]:  # Max 2 skills per mode
                examples = skill.intent.examples
                if not examples:
                    continue
                case_counter += 1
                cases.append(
                    BenchmarkCase(
                        id=f"exec-auto-{case_counter:03d}",
                        input=examples[0],
                        expected_skill=skill.name,
                        expected_execution_mode=mode,
                        expected_complexity=complexity,
                        category="execution",
                        subcategory=f"{mode}_mode",
                        paraphrases=examples[1:2],
                        tags=[mode],
                    )
                )

        return cases

    def generate_team_benchmarks(self) -> list[BenchmarkCase]:
        """Generate team collaboration benchmark cases.

        Uses the skills whose effective execution mode maps to "high"
        complexity: an explicit ``@team:`` case when at least two exist, and a
        complexity-triggered case when at least one exists.
        """
        cases: list[BenchmarkCase] = []
        case_counter = 0

        # High-complexity skills suitable for team collaboration
        high_complexity_skills = [
            s
            for s in self.load_skills()
            if EXECUTION_MODE_TO_COMPLEXITY.get(self._get_effective_execution_mode(s), "low")
            == "high"
        ]

        if len(high_complexity_skills) >= 2:
            skill_a, skill_b = high_complexity_skills[0], high_complexity_skills[1]
            case_counter += 1
            cases.append(
                BenchmarkCase(
                    id=f"team-auto-{case_counter:03d}",
                    input=f"@team:{skill_a.name},{skill_b.name} 协作完成复杂分析任务",
                    expected_execution_mode="react",
                    expected_complexity="high",
                    category="team",
                    subcategory="explicit_team",
                    paraphrases=[
                        f"需要{skill_a.name}和{skill_b.name}协作分析",
                        f"组建团队:{skill_a.name} + {skill_b.name}",
                    ],
                    tags=["team", skill_a.name, skill_b.name],
                )
            )

        # Complexity-triggered team
        if high_complexity_skills:
            skill = high_complexity_skills[0]
            # First keyword of the skill, or a generic "analyze" verb.
            topic = skill.intent.keywords[0] if skill.intent.keywords else "分析"
            case_counter += 1
            cases.append(
                BenchmarkCase(
                    id=f"team-complexity-{case_counter:03d}",
                    input=f"深度{topic}并生成详细报告",
                    expected_execution_mode="react",
                    expected_complexity="high",
                    category="team",
                    subcategory="complexity_trigger",
                    paraphrases=[
                        f"全面{topic}并输出报告",
                    ],
                    tags=["team", "complexity"],
                )
            )

        return cases

    def generate_semantic_benchmarks(self) -> list[BenchmarkCase]:
        """Generate semantic router specific benchmark cases.

        Each skill with a non-empty intent description yields one case whose
        input is that description (tests semantic matching, not keyword
        matching).
        """
        cases: list[BenchmarkCase] = []
        case_counter = 0

        for skill in self.load_skills():
            if not skill.intent.description:
                continue
            case_counter += 1
            exec_mode = self._get_effective_execution_mode(skill)
            cases.append(
                BenchmarkCase(
                    id=f"semantic-auto-{case_counter:03d}",
                    input=skill.intent.description,
                    expected_skill=skill.name,
                    expected_execution_mode=exec_mode,
                    expected_complexity=EXECUTION_MODE_TO_COMPLEXITY.get(exec_mode, "low"),
                    category="semantic_router",
                    subcategory="description_match",
                    tags=["semantic", skill.name],
                )
            )

        return cases

    def generate_all(self) -> list[BenchmarkCase]:
        """Generate all auto-generated benchmark cases (routing, execution, team, semantic)."""
        return [
            *self.generate_routing_benchmarks(),
            *self.generate_execution_benchmarks(),
            *self.generate_team_benchmarks(),
            *self.generate_semantic_benchmarks(),
        ]

    def get_skill_names(self) -> set[str]:
        """Get all skill names from configs."""
        return {s.name for s in self.load_skills()}
b/tests/e2e/capability_metrics.py @@ -0,0 +1,1366 @@ +"""Agent Capability Metrics — Collection, Analysis, and Reporting. + +Core components: + 1. CapabilityMetrics: data model for a single test observation + 2. MetricsCollector: session-scoped collector that gathers all observations + 3. MetricsAnalyzer: computes recall/precision/F1, overfitting scores, weakness analysis + 4. MetricsReporter: generates human-readable and machine-readable reports + +Design: + - Collector is a pytest fixture (session-scoped), injected into capability tests + - Each test records what actually happened vs what was expected + - After all tests, analyzer computes aggregate metrics + - Reporter outputs JSON + plain-text summary +""" + +import json +import os +import time +from collections import defaultdict +from datetime import datetime, timezone +from typing import Any + +from pydantic import BaseModel, ConfigDict + +from tests.e2e.benchmark_dataset import BenchmarkCase + + +# ═══════════════════════════════════════════════════════════════════════════ +# 1. 
class CapabilityObservation(BaseModel):
    """A single test observation: what was expected vs what actually happened.

    One instance is recorded per benchmark run (original input or paraphrase).
    The ``expected_*`` fields hold the ground truth, the ``actual_*`` fields
    hold what was observed, and the ``*_correct`` fields hold the judgment
    derived from comparing the two.
    """

    model_config = ConfigDict()

    # Identity
    benchmark_id: str
    test_name: str
    timestamp: str

    # Input
    input_query: str
    is_paraphrase: bool = False  # True if this is a paraphrase test (overfitting detection)

    # Expected (ground truth)
    expected_skill: str | None = None
    expected_execution_mode: str = "direct"
    expected_complexity: str = "low"

    # Actual (observed)
    actual_skill: str | None = None
    actual_execution_mode: str | None = None
    actual_status_code: int = 0
    actual_response_keys: list[str] = []
    actual_complexity_score: float | None = None
    actual_match_method: str | None = None
    actual_match_confidence: float | None = None

    # Judgments (tri-state: True / False / None)
    skill_correct: bool | None = None  # None = couldn't determine
    execution_mode_correct: bool | None = None
    complexity_correct: bool | None = None
    task_succeeded: bool = False  # HTTP 200 + valid response

    # Metadata
    category: str = ""
    subcategory: str = ""
    response_time_ms: float = 0.0
    error_message: str | None = None

    # Alignment & Cascade fields (U5)
    alignment_violations: int = 0  # Number of constraint violations detected
    cascade_alert: bool = False  # Whether a cascade alert was triggered


class CategoryMetrics(BaseModel):
    """Aggregate metrics for a specific category/subcategory.

    Every counter and rate defaults to zero, so an instance built with only
    ``category`` and ``subcategory`` represents "no observations".
    """

    model_config = ConfigDict()

    category: str
    subcategory: str
    total: int = 0  # number of observations aggregated
    # Skill routing
    skill_correct: int = 0
    skill_recall: float = 0.0
    skill_precision: float = 0.0
    skill_f1: float = 0.0
    # Execution mode
    execution_mode_correct: int = 0
    execution_mode_accuracy: float = 0.0
    # Complexity
    complexity_correct: int = 0
    complexity_accuracy: float = 0.0
    # Task outcome and latency
    task_success_rate: float = 0.0
    avg_response_time_ms: float = 0.0
class WeaknessItem(BaseModel):
    """A single identified weakness, with supporting evidence and a suggested fix."""

    model_config = ConfigDict()

    dimension: str  # routing / execution / quality / team / consistency
    subcategory: str
    severity: str  # critical / high / medium / low
    description: str
    evidence: str
    suggestion: str


class RootCause(BaseModel):
    """Root cause analysis for a weakness."""

    model_config = ConfigDict()

    # One of the cause-type identifiers listed in the trailing comment.
    cause_type: str  # keyword_gap / complexity_misjudge / intent_ambiguous / fallback_missing / overfit_pattern / tool_missing / config_error / quality_threshold
    cause_description: str
    confidence: float = 0.0  # 0.0~1.0, how confident we are about this root cause
    affected_cases: list[str] = []  # benchmark IDs affected by this cause
    detail: str = ""  # additional technical detail


class ImprovementAction(BaseModel):
    """A single actionable improvement step."""

    model_config = ConfigDict()

    action_id: str
    title: str
    description: str
    target_module: str  # which code module to modify
    priority: str  # P0 / P1 / P2 / P3
    expected_impact: str  # what improvement to expect
    effort: str  # small / medium / large
    related_causes: list[str] = []  # cause_types this action addresses
    verification: str = ""  # how to verify the fix works


class ImprovementPlan(BaseModel):
    """Improvement plan for a specific weakness.

    Groups the root causes behind one weakness with the actions proposed to
    address them and a summary strategy.
    """

    model_config = ConfigDict()

    weakness_description: str
    root_causes: list[RootCause]
    actions: list[ImprovementAction]
    overall_strategy: str
def record_benchmark_result(
    self,
    benchmark: BenchmarkCase,
    *,
    test_name: str,
    actual_skill: str | None = None,
    actual_execution_mode: str | None = None,
    actual_status_code: int = 0,
    actual_response_keys: list[str] | None = None,
    task_succeeded: bool = False,
    is_paraphrase: bool = False,
    error_message: str | None = None,
) -> CapabilityObservation:
    """Judge one benchmark run against its expectations and store the observation.

    The timer started for ``benchmark.id`` is stopped to obtain the response
    time. Skill, execution-mode and complexity correctness are tri-state:
    ``None`` means the judgment could not be made from the supplied values.

    Returns:
        The observation that was appended to the collector.
    """
    elapsed_ms = self.stop_timer(benchmark.id)
    wanted_skill = benchmark.expected_skill

    # Skill judgment.
    if wanted_skill is None:
        # No specific skill expected: no routing at all, or any successful run, counts.
        skill_ok: bool | None = actual_skill is None or task_succeeded
    elif actual_skill is not None:
        skill_ok = actual_skill == wanted_skill
    else:
        skill_ok = None

    # Execution-mode judgment: map router-side names onto benchmark-side names
    # (actual "skill_react" / "direct_chat" vs expected "react" / "direct").
    if actual_execution_mode is None:
        mode_ok: bool | None = None
    else:
        aliases: dict[str, str] = {
            "skill_react": "react",
            "direct_chat": "direct",
            "team_collab": "team_collab",
        }
        normalized = aliases.get(actual_execution_mode, actual_execution_mode)
        mode_ok = normalized == benchmark.expected_execution_mode

    # Complexity is approximated by the execution-mode judgment.
    complexity_ok = mode_ok

    observation = CapabilityObservation(
        benchmark_id=benchmark.id,
        test_name=test_name,
        timestamp=datetime.now(timezone.utc).isoformat(),
        input_query=benchmark.input,
        is_paraphrase=is_paraphrase,
        expected_skill=wanted_skill,
        expected_execution_mode=benchmark.expected_execution_mode,
        expected_complexity=benchmark.expected_complexity,
        actual_skill=actual_skill,
        actual_execution_mode=actual_execution_mode,
        actual_status_code=actual_status_code,
        actual_response_keys=actual_response_keys or [],
        skill_correct=skill_ok,
        execution_mode_correct=mode_ok,
        complexity_correct=complexity_ok,
        task_succeeded=task_succeeded,
        category=benchmark.category,
        subcategory=benchmark.subcategory,
        response_time_ms=elapsed_ms,
        error_message=error_message,
    )
    self._observations.append(observation)
    return observation
def analyze_category(
    self, observations: list[CapabilityObservation], category: str, subcategory: str
) -> CategoryMetrics:
    """Aggregate observations of one category (optionally one subcategory).

    An empty ``subcategory`` selects every observation of ``category``. When
    nothing matches, an all-zero ``CategoryMetrics`` is returned.
    """

    def in_scope(obs: CapabilityObservation) -> bool:
        if obs.category != category:
            return False
        return not subcategory or obs.subcategory == subcategory

    selected = [obs for obs in observations if in_scope(obs)]
    if not selected:
        return CategoryMetrics(category=category, subcategory=subcategory)

    sample_size = len(selected)
    misrouted = [obs for obs in selected if obs.skill_correct is False]

    # Skill routing counts fed into precision / recall / F1:
    #   TP = judged correct
    #   FP = judged wrong while some skill was actually selected
    #   FN = judged wrong while a skill was expected
    true_pos = sum(1 for obs in selected if obs.skill_correct is True)
    false_pos = sum(1 for obs in misrouted if obs.actual_skill is not None)
    false_neg = sum(1 for obs in misrouted if obs.expected_skill is not None)
    precision, recall, f1 = self.compute_prf(true_pos, false_pos, false_neg)

    mode_hits = sum(1 for obs in selected if obs.execution_mode_correct is True)
    complexity_hits = sum(1 for obs in selected if obs.complexity_correct is True)
    successes = sum(1 for obs in selected if obs.task_succeeded)
    mean_latency = sum(obs.response_time_ms for obs in selected) / sample_size

    return CategoryMetrics(
        category=category,
        subcategory=subcategory,
        total=sample_size,
        skill_correct=true_pos,
        skill_recall=round(recall, 4),
        skill_precision=round(precision, 4),
        skill_f1=round(f1, 4),
        execution_mode_correct=mode_hits,
        execution_mode_accuracy=round(self._safe_div(mode_hits, sample_size), 4),
        complexity_correct=complexity_hits,
        complexity_accuracy=round(self._safe_div(complexity_hits, sample_size), 4),
        task_success_rate=round(self._safe_div(successes, sample_size), 4),
        avg_response_time_ms=round(mean_latency, 2),
    )
def identify_weaknesses(
    self,
    category_metrics: list[CategoryMetrics],
    overfitting_results: list[OverfittingResult],
) -> list[WeaknessItem]:
    """Identify intelligence weaknesses based on metrics analysis.

    Args:
        category_metrics: Per-category aggregates; categories with fewer than
            two samples are never flagged.
        overfitting_results: Per-benchmark overfitting verdicts; all
            overfitted cases are reported as one combined weakness.

    Returns:
        Weaknesses ordered from most to least severe (critical, high, medium,
        low); ties keep their discovery order.
    """
    weaknesses: list[WeaknessItem] = []

    for cm in category_metrics:
        if cm.total < 2:
            # Too few samples for any of the threshold checks below.
            continue

        prf_evidence = (
            f"召回率={cm.skill_recall:.2%}, 精确率={cm.skill_precision:.2%}, 样本数={cm.total}"
        )

        # Low skill F1
        if cm.skill_f1 < 0.5:
            weaknesses.append(
                WeaknessItem(
                    dimension=cm.category,
                    subcategory=cm.subcategory,
                    severity="critical" if cm.skill_f1 < 0.3 else "high",
                    description=f"技能路由F1过低 ({cm.skill_f1:.2f}),子类别: {cm.subcategory}",
                    evidence=prf_evidence,
                    suggestion="改进该子类别的关键词匹配或意图分类逻辑",
                )
            )
        elif cm.skill_f1 < 0.8:
            weaknesses.append(
                WeaknessItem(
                    dimension=cm.category,
                    subcategory=cm.subcategory,
                    severity="medium",
                    description=f"技能路由F1偏低 ({cm.skill_f1:.2f}),子类别: {cm.subcategory}",
                    evidence=prf_evidence,
                    suggestion="微调路由阈值或增加更多意图示例",
                )
            )

        # Low execution mode accuracy
        if cm.execution_mode_accuracy < 0.6:
            weaknesses.append(
                WeaknessItem(
                    dimension=cm.category,
                    subcategory=cm.subcategory,
                    severity="high" if cm.execution_mode_accuracy < 0.4 else "medium",
                    description=f"执行模式准确率过低 ({cm.execution_mode_accuracy:.2%}),子类别: {cm.subcategory}",
                    evidence=f"正确数={cm.execution_mode_correct}/{cm.total}",
                    suggestion="检查复杂度估算和模式选择逻辑",
                )
            )

        # Low task success rate
        if cm.task_success_rate < 0.8:
            # The rate is stored rounded to 4 decimals, so rate * total can land
            # just below the true count (1/3 -> 0.3333 * 3 = 0.9999). Round to
            # the nearest integer instead of truncating.
            success_count = round(cm.task_success_rate * cm.total)
            weaknesses.append(
                WeaknessItem(
                    dimension=cm.category,
                    subcategory=cm.subcategory,
                    severity="critical" if cm.task_success_rate < 0.5 else "high",
                    description=f"任务成功率过低 ({cm.task_success_rate:.2%}),子类别: {cm.subcategory}",
                    evidence=f"成功数={success_count}/{cm.total}",
                    suggestion="排查该子类别的任务执行失败原因",
                )
            )

    # Overfitting weaknesses
    overfitted_ids = [r.benchmark_id for r in overfitting_results if r.is_overfitted]
    if overfitted_ids:
        weaknesses.append(
            WeaknessItem(
                dimension="routing",
                subcategory="overfitting",
                severity="high",
                description=f"检测到 {len(overfitted_ids)} 个用例存在过拟合",
                evidence=f"过拟合用例: {', '.join(overfitted_ids)}",
                suggestion="增加更多样化的训练样本和同义改写,提升泛化能力",
            )
        )

    # Sort by severity (stable, unknown severities last)
    severity_order = {"critical": 0, "high": 1, "medium": 2, "low": 3}
    weaknesses.sort(key=lambda w: severity_order.get(w.severity, 99))

    return weaknesses
═════════════════════════════════════════════════════════════════════ + + def analyze_root_causes( + self, + observations: list[CapabilityObservation], + category_metrics: list[CategoryMetrics], + overfitting_results: list[OverfittingResult], + weaknesses: list[WeaknessItem], + ) -> list[RootCause]: + """Perform root cause analysis based on observation data. + + Strategy: + 1. For each weakness, examine the raw observations to find patterns + 2. Cross-reference paraphrase vs original results for overfitting clues + 3. Analyze error messages for common failure modes + 4. Check recall vs precision imbalance to distinguish cause types + """ + root_causes: list[RootCause] = [] + originals = [o for o in observations if not o.is_paraphrase] + paraphrases = [o for o in observations if o.is_paraphrase] + + # --- Cause 1: Keyword gap (low recall = keywords not matching) --- + low_recall_cases = [ + o + for o in originals + if o.skill_correct is False and o.expected_skill is not None and o.actual_skill is None + ] + if low_recall_cases: + affected = [o.benchmark_id for o in low_recall_cases] + # Check if paraphrases also fail → confirms keyword gap + para_also_fail = sum( + 1 for p in paraphrases if p.benchmark_id in affected and p.skill_correct is False + ) + confidence = min(1.0, 0.5 + 0.1 * para_also_fail) if paraphrases else 0.6 + root_causes.append( + RootCause( + cause_type="keyword_gap", + cause_description="关键词覆盖不足:用户输入无法匹配到目标技能的关键词", + confidence=round(confidence, 2), + affected_cases=affected[:10], + detail=( + f"共 {len(low_recall_cases)} 个原始输入未能路由到期望技能。" + f"改写输入中也有 {para_also_fail} 个失败," + f"说明关键词库对同义表达的覆盖不足。" + f"受影响子类别: {', '.join(set(o.subcategory for o in low_recall_cases))}" + ), + ) + ) + + # --- Cause 2: Precision gap (wrong skill routed = intent ambiguous) --- + wrong_route_cases = [ + o + for o in originals + if o.skill_correct is False + and o.actual_skill is not None + and o.expected_skill is not None + ] + if wrong_route_cases: + affected = 
[o.benchmark_id for o in wrong_route_cases] + # Check which skills are being confused + confusion_pairs: dict[tuple[str, str], int] = defaultdict(int) + for o in wrong_route_cases: + confusion_pairs[(o.expected_skill, o.actual_skill)] += 1 + top_confusions = sorted(confusion_pairs.items(), key=lambda x: -x[1])[:5] + confusion_detail = "; ".join( + f"{exp}→{act}({cnt}次)" for (exp, act), cnt in top_confusions + ) + root_causes.append( + RootCause( + cause_type="intent_ambiguous", + cause_description="意图歧义:不同技能的关键词/意图描述重叠,导致路由混淆", + confidence=0.7, + affected_cases=affected[:10], + detail=f"技能混淆对: {confusion_detail}", + ) + ) + + # --- Cause 3: Complexity misjudge (execution mode wrong) --- + exec_wrong_cases = [o for o in originals if o.execution_mode_correct is False] + if exec_wrong_cases: + affected = [o.benchmark_id for o in exec_wrong_cases] + # Analyze direction of misjudgment + over_simplified = sum( + 1 + for o in exec_wrong_cases + if o.expected_complexity in ("high", "medium") + and o.actual_execution_mode == "direct" + ) + over_complicated = sum( + 1 + for o in exec_wrong_cases + if o.expected_complexity == "low" + and o.actual_execution_mode in ("react", "rewoo", "reflexion") + ) + direction = "" + if over_simplified > over_complicated: + direction = "倾向低估复杂度(将复杂任务误判为简单直接调用)" + elif over_complicated > over_simplified: + direction = "倾向高估复杂度(将简单任务误判为需要多步推理)" + else: + direction = "复杂度误判方向不明确,双向均有偏差" + + root_causes.append( + RootCause( + cause_type="complexity_misjudge", + cause_description=f"复杂度估算偏差:{direction}", + confidence=0.75, + affected_cases=affected[:10], + detail=( + f"共 {len(exec_wrong_cases)} 个执行模式判断错误。" + f"低估复杂度 {over_simplified} 次,高估复杂度 {over_complicated} 次。" + f"受影响子类别: {', '.join(set(o.subcategory for o in exec_wrong_cases))}" + ), + ) + ) + + # --- Cause 4: Fallback missing (no skill matched, task failed) --- + fallback_fail_cases = [ + o for o in originals if o.expected_skill is None and not o.task_succeeded + ] + if fallback_fail_cases: 
+ affected = [o.benchmark_id for o in fallback_fail_cases] + root_causes.append( + RootCause( + cause_type="fallback_missing", + cause_description="回退机制不足:无匹配技能时,直接聊天模式未能正常处理", + confidence=0.65, + affected_cases=affected[:10], + detail=( + f"共 {len(fallback_fail_cases)} 个无技能匹配的任务执行失败。" + f"错误信息: {'; '.join(set(o.error_message or 'N/A' for o in fallback_fail_cases[:5]))}" + ), + ) + ) + + # --- Cause 5: Overfit pattern (paraphrases fail while original succeeds) --- + overfitted = [r for r in overfitting_results if r.is_overfitted] + if overfitted: + affected = [r.benchmark_id for r in overfitted] + # Analyze what kind of paraphrases fail + para_fail_details: list[str] = [] + for r in overfitted: + fail_count = sum(1 for ok in r.paraphrase_results if not ok) + para_fail_details.append( + f"{r.benchmark_id}({fail_count}/{len(r.paraphrase_results)}改写失败)" + ) + root_causes.append( + RootCause( + cause_type="overfit_pattern", + cause_description="路由过拟合:对特定表述形式过度敏感,同义改写后路由失败", + confidence=0.85, + affected_cases=affected, + detail=( + f"共 {len(overfitted)} 个用例存在过拟合。" + f"详情: {'; '.join(para_fail_details)}。" + f"说明路由逻辑对输入的具体措辞过于敏感,缺乏语义层面的泛化能力。" + ), + ) + ) + + # --- Cause 6: Quality threshold (task succeeded but output poor) --- + success_but_wrong = [o for o in originals if o.task_succeeded and o.skill_correct is False] + if len(success_but_wrong) >= 2: + affected = [o.benchmark_id for o in success_but_wrong] + root_causes.append( + RootCause( + cause_type="quality_threshold", + cause_description="质量门控阈值过低:任务虽成功完成但输出了错误结果", + confidence=0.6, + affected_cases=affected[:10], + detail=( + f"共 {len(success_but_wrong)} 个任务虽然HTTP成功但路由到了错误技能。" + f"质量门控未能拦截这些错误路由的结果。" + ), + ) + ) + + # --- Cause 7: Config error (HTTP errors) --- + error_cases = [o for o in originals if o.error_message and not o.task_succeeded] + if error_cases: + # Group by error pattern + error_patterns: dict[str, int] = defaultdict(int) + for o in error_cases: + # Simplify error message to pattern + msg = 
def plan_improvements(
    self,
    weaknesses: list[WeaknessItem],
    root_causes: list[RootCause],
) -> list[ImprovementPlan]:
    """Generate improvement plans based on weaknesses and root causes.

    Root causes are grouped by ``cause_type``; for every known type that is
    present, one ``ImprovementPlan`` with a fixed set of actions is appended.
    Plans are emitted in a fixed order: keyword_gap, intent_ambiguous,
    complexity_misjudge, fallback_missing, overfit_pattern,
    quality_threshold, config_error.

    Args:
        weaknesses: Accepted for interface symmetry; this body never reads it.
        root_causes: Root causes to plan for. Only ``cause_type``,
            ``cause_description`` and ``affected_cases`` are read here.

    Returns:
        The generated plans. Action ids (``ACT-001``, ``ACT-002``, ...) are
        numbered consecutively across all plans of a single call.
    """
    plans: list[ImprovementPlan] = []
    # Shared across all plans so that action ids stay unique within one call.
    action_counter = 0

    # Map root causes by type for quick lookup
    causes_by_type: dict[str, list[RootCause]] = defaultdict(list)
    for rc in root_causes:
        causes_by_type[rc.cause_type].append(rc)

    # NOTE: the membership tests below do not insert keys into the
    # defaultdict; only the subscript access inside each branch does, and it
    # is only reached when the key already exists.

    # --- Plan for keyword_gap ---
    if "keyword_gap" in causes_by_type:
        # Only the first cause of a type supplies the plan description; the
        # complete list is still attached as the plan's root_causes.
        cause = causes_by_type["keyword_gap"][0]
        actions: list[ImprovementAction] = []
        action_counter += 1
        actions.append(
            ImprovementAction(
                action_id=f"ACT-{action_counter:03d}",
                title="扩展技能关键词同义词库",
                description=(
                    "为每个技能的 intent.keywords 添加更多同义词、近义词和用户常见表述。"
                    "重点补充中文变体、口语化表达和行业术语。"
                ),
                target_module="configs/skills/*.yaml → intent.keywords",
                priority="P0",
                expected_impact=f"预计提升召回率 15~30%,影响 {len(cause.affected_cases)} 个用例",
                effort="small",
                related_causes=["keyword_gap"],
                verification="重新运行E2E回测,验证受影响用例的召回率提升",
            )
        )
        action_counter += 1
        actions.append(
            ImprovementAction(
                action_id=f"ACT-{action_counter:03d}",
                title="引入语义相似度匹配(Layer 1.5)",
                description=(
                    "在 CostAwareRouter 的 Layer 1.5 SemanticRouter 中,"
                    "使用向量嵌入计算用户输入与技能描述的语义相似度,"
                    "弥补关键词精确匹配的不足。"
                ),
                target_module="src/agentkit/chat/skill_routing.py",
                priority="P1",
                expected_impact="预计提升召回率 20~40%,显著改善同义改写场景",
                effort="large",
                related_causes=["keyword_gap", "overfit_pattern"],
                verification="运行过拟合检测回测,验证改写一致性提升至 >80%",
            )
        )
        plans.append(
            ImprovementPlan(
                weakness_description=cause.cause_description,
                root_causes=causes_by_type["keyword_gap"],
                actions=actions,
                overall_strategy=(
                    "短期:扩充关键词库(低成本高收益);"
                    "中期:引入语义匹配层(高成本高收益);"
                    "长期:基于用户真实查询日志持续优化关键词库"
                ),
            )
        )

    # --- Plan for intent_ambiguous ---
    if "intent_ambiguous" in causes_by_type:
        cause = causes_by_type["intent_ambiguous"][0]
        # A fresh list per plan: earlier plans keep their own actions object.
        actions = []
        action_counter += 1
        actions.append(
            ImprovementAction(
                action_id=f"ACT-{action_counter:03d}",
                title="为易混淆技能添加互斥关键词",
                description=(
                    "在技能配置中为容易混淆的技能对添加互斥关键词(disambiguation_keywords),"
                    "当用户输入同时匹配多个技能时,优先选择包含互斥关键词的技能。"
                ),
                target_module="configs/skills/*.yaml → intent.disambiguation_keywords",
                priority="P1",
                expected_impact="预计提升精确率 10~25%,减少技能混淆",
                effort="small",
                related_causes=["intent_ambiguous"],
                verification="运行歧义消解回测,验证路由精确率提升",
            )
        )
        action_counter += 1
        actions.append(
            ImprovementAction(
                action_id=f"ACT-{action_counter:03d}",
                title="实现LLM二次分类消歧",
                description=(
                    "当 Layer 0/1 路由到多个候选技能时,"
                    "调用 LLM quick_classify 进行二次意图判断,"
                    "选择最匹配的技能。"
                ),
                target_module="src/agentkit/chat/skill_routing.py → Layer 1",
                priority="P2",
                expected_impact="预计提升精确率 15~30%,但增加 ~500ms 延迟和 ~100 tokens",
                effort="medium",
                related_causes=["intent_ambiguous"],
                verification="运行歧义消解回测,对比延迟和精确率变化",
            )
        )
        plans.append(
            ImprovementPlan(
                weakness_description=cause.cause_description,
                root_causes=causes_by_type["intent_ambiguous"],
                actions=actions,
                overall_strategy=(
                    "短期:添加互斥关键词消歧;"
                    "中期:启用LLM二次分类;"
                    "长期:训练专用意图分类模型替代规则匹配"
                ),
            )
        )

    # --- Plan for complexity_misjudge ---
    if "complexity_misjudge" in causes_by_type:
        cause = causes_by_type["complexity_misjudge"][0]
        actions = []
        action_counter += 1
        actions.append(
            ImprovementAction(
                action_id=f"ACT-{action_counter:03d}",
                title="优化复杂度估算启发式规则",
                description=(
                    "调整 HeuristicClassifier 的复杂度评分权重:"
                    "增加任务动词(分析/研究/设计)的权重,"
                    "降低简单问答动词(是什么/多少)的权重。"
                ),
                target_module="src/agentkit/chat/skill_routing.py → HeuristicClassifier",
                priority="P1",
                expected_impact="预计提升执行模式准确率 10~20%",
                effort="small",
                related_causes=["complexity_misjudge"],
                verification="运行执行模式回测,验证准确率提升",
            )
        )
        action_counter += 1
        actions.append(
            ImprovementAction(
                action_id=f"ACT-{action_counter:03d}",
                title="引入任务复杂度校准数据集",
                description=(
                    "收集标注了复杂度等级的真实用户查询,"
                    "构建校准数据集,定期评估和调整复杂度阈值。"
                ),
                target_module="tests/e2e/benchmark_dataset.py",
                priority="P2",
                expected_impact="持续提升复杂度判断准确性",
                effort="medium",
                related_causes=["complexity_misjudge"],
                verification="每次调整后运行回测,对比前后F1变化",
            )
        )
        plans.append(
            ImprovementPlan(
                weakness_description=cause.cause_description,
                root_causes=causes_by_type["complexity_misjudge"],
                actions=actions,
                overall_strategy=(
                    "短期:调整启发式规则权重;"
                    "中期:构建复杂度校准数据集;"
                    "长期:训练复杂度评估模型替代规则"
                ),
            )
        )

    # --- Plan for fallback_missing ---
    if "fallback_missing" in causes_by_type:
        cause = causes_by_type["fallback_missing"][0]
        actions = []
        action_counter += 1
        actions.append(
            ImprovementAction(
                action_id=f"ACT-{action_counter:03d}",
                title="增强DIRECT_CHAT回退路径",
                description=(
                    "当无技能匹配时,确保DIRECT_CHAT模式能正常处理请求:"
                    "1) 检查默认Agent是否正确初始化;"
                    "2) 确保无技能时不会触发空指针异常;"
                    "3) 添加友好的降级提示。"
                ),
                target_module="src/agentkit/chat/skill_routing.py → _fallback_direct_chat",
                priority="P0",
                expected_impact="确保100%的请求都有回退处理,消除任务失败",
                effort="small",
                related_causes=["fallback_missing"],
                verification="运行回退场景回测,验证所有无匹配请求均成功",
            )
        )
        plans.append(
            ImprovementPlan(
                weakness_description=cause.cause_description,
                root_causes=causes_by_type["fallback_missing"],
                actions=actions,
                overall_strategy=(
                    "短期:修复回退路径确保基本可用;"
                    "中期:优化回退模式的回答质量;"
                    "长期:基于用户反馈自动发现新技能需求"
                ),
            )
        )

    # --- Plan for overfit_pattern ---
    if "overfit_pattern" in causes_by_type:
        cause = causes_by_type["overfit_pattern"][0]
        actions = []
        action_counter += 1
        actions.append(
            ImprovementAction(
                action_id=f"ACT-{action_counter:03d}",
                title="添加意图描述和示例(intent.description + examples)",
                description=(
                    "为每个技能添加 intent.description(自然语言描述)和 intent.examples(示例查询),"
                    "使路由器能理解语义层面的意图,而不仅依赖关键词精确匹配。"
                ),
                target_module="configs/skills/*.yaml → intent.description / intent.examples",
                priority="P0",
                expected_impact="预计提升改写一致性 20~40%",
                effort="small",
                related_causes=["overfit_pattern", "keyword_gap"],
                verification="运行过拟合检测回测,验证改写一致性提升",
            )
        )
        action_counter += 1
        actions.append(
            ImprovementAction(
                action_id=f"ACT-{action_counter:03d}",
                title="实现意图泛化测试CI",
                description=(
                    "在CI中集成意图泛化回测:每次修改路由逻辑或技能配置后,"
                    "自动运行包含改写的回测用例,确保不引入新的过拟合。"
                ),
                target_module=".github/workflows/ + tests/e2e/",
                priority="P2",
                expected_impact="防止过拟合回归,持续监控泛化能力",
                effort="medium",
                related_causes=["overfit_pattern"],
                verification="CI流水线中自动运行回测并检查过拟合分数",
            )
        )
        plans.append(
            ImprovementPlan(
                weakness_description=cause.cause_description,
                root_causes=causes_by_type["overfit_pattern"],
                actions=actions,
                overall_strategy=(
                    "短期:补充意图描述和示例;"
                    "中期:引入语义匹配(同keyword_gap方案);"
                    "长期:建立意图泛化CI防线"
                ),
            )
        )

    # --- Plan for quality_threshold ---
    if "quality_threshold" in causes_by_type:
        cause = causes_by_type["quality_threshold"][0]
        actions = []
        action_counter += 1
        actions.append(
            ImprovementAction(
                action_id=f"ACT-{action_counter:03d}",
                title="增强质量门控的技能匹配验证",
                description=(
                    "在QualityGate中增加技能匹配验证:"
                    "检查输出是否与路由到的技能的能力范围一致,"
                    "如果不一致则触发重试或降级。"
                ),
                target_module="src/agentkit/quality/gate.py",
                priority="P1",
                expected_impact="减少错误路由导致的低质量输出",
                effort="medium",
                related_causes=["quality_threshold"],
                verification="运行质量门控回测,验证错误路由拦截率",
            )
        )
        plans.append(
            ImprovementPlan(
                weakness_description=cause.cause_description,
                root_causes=causes_by_type["quality_threshold"],
                actions=actions,
                overall_strategy=(
                    "短期:增加技能匹配验证;"
                    "中期:引入输出质量评分模型;"
                    "长期:实现自动质量回归检测"
                ),
            )
        )

    # --- Plan for config_error ---
    if "config_error" in causes_by_type:
        cause = causes_by_type["config_error"][0]
        actions = []
        action_counter += 1
        actions.append(
            ImprovementAction(
                action_id=f"ACT-{action_counter:03d}",
                title="修复服务端配置和异常处理",
                description=(
                    "根据错误信息排查服务端配置问题:"
                    "1) 检查API路由注册是否完整;"
                    "2) 增加输入校验和错误提示;"
                    "3) 确保所有异常都有友好的错误响应。"
                ),
                target_module="src/agentkit/server/routes/",
                priority="P0",
                expected_impact="消除服务端错误,提升任务成功率",
                effort="small",
                related_causes=["config_error"],
                verification="重新运行E2E回测,验证HTTP错误率降低",
            )
        )
        plans.append(
            ImprovementPlan(
                weakness_description=cause.cause_description,
                root_causes=causes_by_type["config_error"],
                actions=actions,
                overall_strategy=(
                    "短期:修复已知配置错误;"
                    "中期:增加输入校验和错误处理;"
                    "长期:建立配置变更的自动化验证"
                ),
            )
        )

    return plans
def analyze_alignment(self, observations: list[CapabilityObservation]) -> dict[str, Any]:
    """Summarise alignment-guard and cascade-detector results.

    Only observations whose ``category`` equals ``"alignment"`` are counted.

    Returns a dict with:
        - total_alignment_tests: number of alignment observations
        - violation_count: sum of ``alignment_violations`` over those tests
        - violation_rate: share of tests with at least one violation
        - cascade_alert_count: number of tests with a truthy ``cascade_alert``
        - cascade_alert_rate: share of tests that raised a cascade alert
        - neg_constraint_pass_rate: share of ``negative_constraint`` tests
          with zero violations
        - pos_constraint_pass_rate: share of ``positive_constraint`` tests
          with zero violations

    All rates are rounded to four decimals; with no alignment observations
    every value is zero.
    """
    tested = violated = violation_sum = cascaded = 0
    # Per-subcategory tallies stored as [passed, seen].
    tally = {"negative_constraint": [0, 0], "positive_constraint": [0, 0]}

    for obs in observations:
        if obs.category != "alignment":
            continue
        tested += 1
        violation_sum += obs.alignment_violations
        if obs.alignment_violations > 0:
            violated += 1
        if obs.cascade_alert:
            cascaded += 1
        bucket = tally.get(obs.subcategory)
        if bucket is not None:
            bucket[1] += 1
            if obs.alignment_violations == 0:
                bucket[0] += 1

    if tested == 0:
        return {
            "total_alignment_tests": 0,
            "violation_count": 0,
            "violation_rate": 0.0,
            "cascade_alert_count": 0,
            "cascade_alert_rate": 0.0,
            "neg_constraint_pass_rate": 0.0,
            "pos_constraint_pass_rate": 0.0,
        }

    neg_rate = self._safe_div(*tally["negative_constraint"])
    pos_rate = self._safe_div(*tally["positive_constraint"])

    summary: dict[str, Any] = {}
    summary["total_alignment_tests"] = tested
    summary["violation_count"] = violation_sum
    summary["violation_rate"] = round(self._safe_div(violated, tested), 4)
    summary["cascade_alert_count"] = cascaded
    summary["cascade_alert_rate"] = round(self._safe_div(cascaded, tested), 4)
    summary["neg_constraint_pass_rate"] = round(neg_rate, 4)
    summary["pos_constraint_pass_rate"] = round(pos_rate, 4)
    return summary
def generate_report(self, collector: MetricsCollector) -> CapabilityReport:
    """Build the full capability report for everything *collector* recorded.

    Overall precision/recall/F1, execution-mode accuracy and task success
    rate are computed over ``collector.get_original_observations()``; the
    per-category, overfitting, weakness, root-cause and improvement sections
    are delegated to the analyzer's other methods. With no original
    observations all overall metrics are ``0.0``.
    """
    all_obs = collector.observations
    baseline = collector.get_original_observations()
    sample_count = len(baseline)

    if sample_count == 0:
        precision = recall = f1 = 0.0
        exec_accuracy = success_rate = 0.0
    else:
        true_pos = false_pos = false_neg = 0
        exec_hits = succeeded = 0
        for obs in baseline:
            if obs.skill_correct is True:
                true_pos += 1
            elif obs.skill_correct is False:
                # A wrong route counts as a false positive for the skill that
                # was picked and a false negative for the one that was expected.
                if obs.actual_skill is not None:
                    false_pos += 1
                if obs.expected_skill is not None:
                    false_neg += 1
            if obs.execution_mode_correct is True:
                exec_hits += 1
            if obs.task_succeeded:
                succeeded += 1

        precision, recall, f1 = self.compute_prf(true_pos, false_pos, false_neg)
        exec_accuracy = self._safe_div(exec_hits, sample_count)
        success_rate = self._safe_div(succeeded, sample_count)

    # One metrics entry per distinct (category, subcategory), in sorted order.
    category_keys: set[tuple[str, str]] = set()
    for obs in baseline:
        category_keys.add((obs.category, obs.subcategory))
    per_category = []
    for category, subcategory in sorted(category_keys):
        per_category.append(self.analyze_category(all_obs, category, subcategory))

    overfit_results, overfit_score = self.detect_overfitting(all_obs)
    weak_points = self.identify_weaknesses(per_category, overfit_results)
    causes = self.analyze_root_causes(all_obs, per_category, overfit_results, weak_points)
    plans = self.plan_improvements(weak_points, causes)

    return CapabilityReport(
        generated_at=datetime.now(timezone.utc).isoformat(),
        total_observations=len(all_obs),
        overall_skill_recall=round(recall, 4),
        overall_skill_precision=round(precision, 4),
        overall_skill_f1=round(f1, 4),
        overall_execution_mode_accuracy=round(exec_accuracy, 4),
        overall_task_success_rate=round(success_rate, 4),
        category_metrics=per_category,
        overfitting_results=overfit_results,
        overfitting_score=overfit_score,
        weaknesses=weak_points,
        root_causes=causes,
        improvement_plans=plans,
        raw_observations=all_obs,
    )
class MetricsReporter:
    """Generate human-readable and machine-readable reports.

    All methods are static; the class only namespaces the three renderers.
    """

    @staticmethod
    def to_json(report: CapabilityReport, path: str) -> None:
        """Save report as JSON.

        The payload is ``report.model_dump()`` written as UTF-8 with
        ``ensure_ascii=False`` so the Chinese text stays readable.
        """
        with open(path, "w", encoding="utf-8") as f:
            json.dump(report.model_dump(), f, ensure_ascii=False, indent=2)

    @staticmethod
    def to_text(report: CapabilityReport) -> str:
        """Generate plain-text summary report in Chinese.

        Sections are appended in a fixed order: header, overall metrics,
        per-category breakdown, then the optional overfitting, semantic
        router, team routing and alignment sections, followed by weaknesses,
        root causes and improvement plans. Returns the lines joined by
        ``"\\n"``.
        """
        lines: list[str] = []

        lines.append("=" * 72)
        lines.append(" AgentKit 智能化能力分析报告")
        lines.append(f" 生成时间: {report.generated_at}")
        lines.append("=" * 72)
        lines.append("")

        # Overall metrics
        lines.append("── 总体指标 ──────────────────────────────────────────────")
        lines.append(f" 观测总数: {report.total_observations}")
        lines.append(f" 技能路由召回率: {report.overall_skill_recall:.2%}")
        lines.append(f" 技能路由精确率: {report.overall_skill_precision:.2%}")
        lines.append(f" 技能路由F1: {report.overall_skill_f1:.2%}")
        lines.append(f" 执行模式准确率: {report.overall_execution_mode_accuracy:.2%}")
        lines.append(f" 任务成功率: {report.overall_task_success_rate:.2%}")
        lines.append(f" 过拟合分数: {report.overfitting_score:.2%}")
        lines.append("")

        # Per-category breakdown
        lines.append("── 分类明细 ──────────────────────────────────────────────")
        for cm in report.category_metrics:
            # Unknown categories/subcategories fall back to their raw key.
            cat_label = {
                "routing": "路由",
                "execution": "执行",
                "quality": "质量",
                "team": "团队",
                "consistency": "一致性",
            }.get(cm.category, cm.category)
            subcat_label = {
                "keyword_match": "关键词匹配",
                "explicit_prefix": "显式前缀",
                "greeting": "问候语",
                "identity": "身份识别",
                "disambiguation": "歧义消解",
                "fallback": "回退处理",
                "complexity_low": "低复杂度",
                "complexity_high": "高复杂度",
                "intent_variant": "意图变体",
                "direct_mode": "直接模式",
                "react_mode": "ReAct模式",
                "quality_gate": "质量门控",
                "output_std": "输出标准化",
                "explicit_team": "显式团队",
                "deterministic": "确定性",
                "overfitting": "过拟合",
            }.get(cm.subcategory, cm.subcategory)
            lines.append(f" [{cat_label}/{subcat_label}]")
            lines.append(
                f" 样本数={cm.total} 召回率={cm.skill_recall:.2%} "
                f"精确率={cm.skill_precision:.2%} F1={cm.skill_f1:.2%}"
            )
            lines.append(
                f" 执行模式准确率={cm.execution_mode_accuracy:.2%} "
                f"成功率={cm.task_success_rate:.2%} "
                f"平均耗时={cm.avg_response_time_ms:.0f}ms"
            )
        lines.append("")

        # Overfitting analysis
        if report.overfitting_results:
            lines.append("── 过拟合分析 ────────────────────────────────────────────")
            for r in report.overfitting_results:
                status = "⚠ 过拟合" if r.is_overfitted else "✓ 正常"
                orig_label = "✓" if r.original_correct else "✗"
                lines.append(
                    f" [{status}] {r.benchmark_id}: "
                    f"原始输入={orig_label}, "
                    f"改写一致性={r.consistency_rate:.0%}"
                )
            lines.append("")

        # Semantic router analysis
        semantic_cats = [cm for cm in report.category_metrics if cm.category == "semantic_router"]
        if semantic_cats:
            lines.append("── 语义路由分析 ──────────────────────────────────────────")
            for cm in semantic_cats:
                lines.append(
                    f" [{cm.subcategory}] 样本数={cm.total} "
                    f"精确率={cm.skill_precision:.2%} F1={cm.skill_f1:.2%}"
                )
            lines.append("")

        # Team routing analysis
        team_cats = [cm for cm in report.category_metrics if cm.category == "team"]
        if team_cats:
            lines.append("── 团队路由分析 ──────────────────────────────────────────")
            for cm in team_cats:
                lines.append(
                    f" [{cm.subcategory}] 样本数={cm.total} "
                    f"成功率={cm.task_success_rate:.2%} "
                    f"执行模式准确率={cm.execution_mode_accuracy:.2%}"
                )
            lines.append("")

        # Alignment guard analysis
        alignment_obs = [o for o in report.raw_observations if o.category == "alignment"]
        if alignment_obs:
            # The alignment figures are not stored on the report, so they are
            # recomputed here from the raw observations.
            analyzer = MetricsAnalyzer()
            alignment_metrics = analyzer.analyze_alignment(report.raw_observations)
            lines.append("── 对齐守卫分析 ──────────────────────────────────────────")
            lines.append(f" 测试总数: {alignment_metrics['total_alignment_tests']}")
            lines.append(f" 约束违规总数: {alignment_metrics['violation_count']}")
            lines.append(f" 违规率: {alignment_metrics['violation_rate']:.2%}")
            lines.append(
                f" 否定约束通过率: {alignment_metrics['neg_constraint_pass_rate']:.2%}"
            )
            lines.append(
                f" 肯定约束通过率: {alignment_metrics['pos_constraint_pass_rate']:.2%}"
            )
            lines.append(f" 级联告警次数: {alignment_metrics['cascade_alert_count']}")
            lines.append(f" 级联告警率: {alignment_metrics['cascade_alert_rate']:.2%}")
            lines.append("")

        # Weakness analysis
        if report.weaknesses:
            lines.append("── 智能化短板识别 ────────────────────────────────────────")
            for w in report.weaknesses:
                icon = {"critical": "🔴", "high": "🟠", "medium": "🟡", "low": "🟢"}.get(
                    w.severity, "⚪"
                )
                severity_label = {
                    "critical": "严重",
                    "high": "高",
                    "medium": "中",
                    "low": "低",
                }.get(w.severity, w.severity)
                lines.append(f" {icon} [{severity_label}] {w.description}")
                lines.append(f" 证据: {w.evidence}")
                lines.append(f" 建议: {w.suggestion}")
                lines.append("")
        else:
            lines.append("── 未检测到显著短板 ────────────────────────────────────")
            lines.append("")

        # Root cause analysis
        if report.root_causes:
            lines.append("── 根因分析 ──────────────────────────────────────────────")
            cause_type_labels = {
                "keyword_gap": "关键词覆盖不足",
                "intent_ambiguous": "意图歧义",
                "complexity_misjudge": "复杂度估算偏差",
                "fallback_missing": "回退机制不足",
                "overfit_pattern": "路由过拟合",
                "quality_threshold": "质量门控阈值过低",
                "config_error": "配置/服务端错误",
                "tool_missing": "工具缺失",
            }
            for rc in report.root_causes:
                type_label = cause_type_labels.get(rc.cause_type, rc.cause_type)
                # Ten-cell bar: filled cells = confidence truncated to tenths.
                conf_bar = "█" * int(rc.confidence * 10) + "░" * (10 - int(rc.confidence * 10))
                lines.append(f" ▸ [{type_label}] 置信度: {conf_bar} {rc.confidence:.0%}")
                lines.append(f" 原因: {rc.cause_description}")
                if rc.detail:
                    lines.append(f" 详情: {rc.detail}")
                if rc.affected_cases:
                    # At most five case ids are printed; "..." marks truncation.
                    lines.append(
                        f" 受影响用例: {', '.join(rc.affected_cases[:5])}"
                        f"{'...' if len(rc.affected_cases) > 5 else ''}"
                    )
                lines.append("")

        # Improvement strategy
        if report.improvement_plans:
            lines.append("── 改进策略规划 ──────────────────────────────────────────")
            for i, plan in enumerate(report.improvement_plans, 1):
                lines.append(f" ┌─ 策略 {i}: {plan.weakness_description}")
                lines.append(f" │ 总体策略: {plan.overall_strategy}")
                lines.append(" │")
                for action in plan.actions:
                    priority_icon = {"P0": "🔴", "P1": "🟠", "P2": "🟡", "P3": "🟢"}.get(
                        action.priority, "⚪"
                    )
                    effort_label = {"small": "小", "medium": "中", "large": "大"}.get(
                        action.effort, action.effort
                    )
                    lines.append(f" │ {priority_icon} [{action.priority}] {action.title}")
                    lines.append(f" │ 目标模块: {action.target_module}")
                    lines.append(f" │ 具体操作: {action.description}")
                    lines.append(f" │ 预期影响: {action.expected_impact}")
                    lines.append(f" │ 工作量: {effort_label}")
                    lines.append(f" │ 验证方式: {action.verification}")
                    lines.append(" │")
                lines.append(f" └{'─' * 60}")
                lines.append("")

        lines.append("=" * 72)
        return "\n".join(lines)

    @staticmethod
    def save_report(report: CapabilityReport, output_dir: str) -> dict[str, str]:
        """Save both JSON and text reports. Returns paths to saved files.

        ``output_dir`` is created if missing. The result maps ``"json"`` and
        ``"text"`` to ``capability_report.json`` / ``capability_report.txt``
        inside that directory.
        """
        os.makedirs(output_dir, exist_ok=True)

        json_path = os.path.join(output_dir, "capability_report.json")
        text_path = os.path.join(output_dir, "capability_report.txt")

        MetricsReporter.to_json(report, json_path)
        with open(text_path, "w", encoding="utf-8") as f:
            f.write(MetricsReporter.to_text(report))

        return {"json": json_path, "text": text_path}
def pytest_configure(config: pytest.Config) -> None:
    """Register the E2E markers and attach a fresh metrics collector to *config*."""
    marker_lines = (
        "e2e: end-to-end backtest (requires server)",
        "e2e_basic: basic function correctness test",
        "e2e_capability: agent intelligence capability test",
    )
    for marker_line in marker_lines:
        config.addinivalue_line("markers", marker_line)

    # Imported lazily, after the markers are registered, so the collector
    # module is only loaded once pytest is actually configuring this suite.
    from tests.e2e.capability_metrics import MetricsCollector

    # Stored on the config object so hooks and fixtures share one instance.
    session_collector = MetricsCollector()
    config._e2e_metrics_collector = session_collector  # type: ignore[attr-defined]
def pytest_sessionfinish(session: pytest.Session, exitstatus: int) -> None:
    """After all tests, generate capability analysis report if data was collected.

    Does nothing when no collector is attached to the config or when the
    collector holds no observations. Otherwise the report is written as JSON
    and text under ``<repo>/test-results/e2e`` and the text version is
    printed to the console.

    Args:
        session: The finishing pytest session; only ``session.config`` is read.
        exitstatus: Exit status reported by pytest (unused).
    """
    # getattr with a default: the attribute is only set by pytest_configure,
    # so a plain attribute access would raise AttributeError (and mask the
    # real failure) whenever that hook did not run to completion.
    collector = getattr(session.config, "_e2e_metrics_collector", None)
    if collector is None or not collector.observations:
        return

    from tests.e2e.capability_metrics import MetricsAnalyzer, MetricsReporter

    analyzer = MetricsAnalyzer()
    report = analyzer.generate_report(collector)

    output_dir = os.path.join(os.path.dirname(__file__), "..", "..", "test-results", "e2e")
    paths = MetricsReporter.save_report(report, output_dir)

    # Print summary to console
    print("\n" + MetricsReporter.to_text(report))
    print(f"\nReport saved to: {paths['json']}")
    print(f"Text report: {paths['text']}")
"content": "def hello():\\n print(\\"Hello, World!\\")"}', + # Analysis + "analyst": '{"result": "analysis complete", "content": "The data shows a positive trend with 15% growth."}', + # ReAct tool call + "react_tool_call": '{"thought": "I need to search for information", "action": "web_search", "action_input": {"query": "test"}, "observation": "Search results found"}', + # ReAct final answer + "react_final": '{"thought": "I have enough information", "final_answer": "Based on my analysis, the answer is 42."}', +} + + +def _build_mock_env(tmp_path: Any) -> dict[str, str]: + """Build environment variables for a server with MockLLMProvider.""" + env = os.environ.copy() + env.update( + { + "AGENTKIT_E2E_MODE": "1", + "AGENTKIT_E2E_MOCK_RESPONSES": json.dumps(MOCK_LLM_RESPONSES), + "AGENTKIT_API_KEY": E2E_API_KEY, + "AGENTKIT_WS_TIMEOUT": "0", + # Disable real LLM calls + "OPENAI_API_KEY": "", + "ANTHROPIC_API_KEY": "", + "DEEPSEEK_API_KEY": "", + } + ) + return env + + +# --------------------------------------------------------------------------- +# Server lifecycle fixture +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def e2e_server(tmp_path_factory: pytest.TempPathFactory) -> Generator[str, None, None]: + """Start a real AgentKit server for the entire E2E session. + + Returns the base URL (e.g. http://127.0.0.1:18765). + The server uses MockLLMProvider so no real LLM calls are made. 
+ """ + tmp_path = tmp_path_factory.mktemp("e2e_server") + + # Generate a minimal agentkit.yaml for the test server + config_dir = tmp_path / "config" + config_dir.mkdir() + config_file = config_dir / "agentkit.yaml" + + import yaml + + config_file.write_text( + yaml.dump( + { + "server": {"host": E2E_HOST, "port": E2E_PORT}, + "llm": {"default_provider": "mock", "providers": {"mock": {"type": "mock"}}}, + "auth": {"enabled": True, "api_keys": [E2E_API_KEY]}, + } + ) + ) + + env = _build_mock_env(tmp_path) + env["AGENTKIT_CONFIG"] = str(config_file) + + # Start server as subprocess + proc = subprocess.Popen( + [ + sys.executable, + "-m", + "agentkit.cli.main", + "serve", + "--host", + E2E_HOST, + "--port", + str(E2E_PORT), + ], + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + cwd=str(tmp_path), + ) + + # Wait for server to be ready (max 30s) + base_url = E2E_BASE_URL + deadline = time.monotonic() + 30 + ready = False + while time.monotonic() < deadline: + try: + resp = httpx.get(f"{base_url}/api/v1/health", timeout=2) + if resp.status_code == 200: + ready = True + break + except httpx.ConnectError: + pass + time.sleep(0.5) + + if not ready: + proc.terminate() + stdout, stderr = proc.communicate(timeout=5) + pytest.fail( + f"E2E server failed to start within 30s.\n" + f"stdout: {stdout.decode()[:2000]}\n" + f"stderr: {stderr.decode()[:2000]}" + ) + + yield base_url + + # Teardown + proc.terminate() + try: + proc.wait(timeout=10) + except subprocess.TimeoutExpired: + proc.kill() + + +# --------------------------------------------------------------------------- +# API client fixture +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def api_client(e2e_server: str) -> httpx.Client: + """Synchronous httpx client configured for the E2E server.""" + return httpx.Client( + base_url=e2e_server, + headers={"X-API-Key": E2E_API_KEY, "Content-Type": "application/json"}, + timeout=30, + ) + + +# 
class CLIRunner:
    """Simulate user CLI operations via subprocess.

    This is the 'OpenCLI' pattern: invoke the real `agentkit` binary
    as a subprocess and capture its output, exactly as a user would.
    """

    def __init__(self, env: dict[str, str] | None = None, cwd: str | None = None):
        """Store the environment and working directory used for every command.

        Args:
            env: Environment for the child process. ``None`` means "copy the
                current environment"; an explicitly passed mapping, even an
                empty one, is used as given.
            cwd: Working directory for the child process, or ``None`` to
                inherit the current one.
        """
        # ``is None`` rather than truthiness: ``env or ...`` would silently
        # swap a deliberately empty environment for the full parent one.
        self.env = os.environ.copy() if env is None else env
        self.cwd = cwd

    def _resolve_agentkit_cmd(self) -> list[str]:
        """Resolve the agentkit command to use.

        Prefer the installed `agentkit` script (handles Rich/Typer output correctly),
        fall back to `python -m agentkit.cli.main`.
        """
        agentkit_path = shutil.which("agentkit")
        if agentkit_path:
            return [agentkit_path]
        return [sys.executable, "-m", "agentkit.cli.main"]

    def run(self, args: list[str], timeout: int = 30) -> subprocess.CompletedProcess[str]:
        """Run an agentkit CLI command and return the result.

        Args:
            args: CLI arguments, e.g. ["version"] or ["task", "submit", ...]
            timeout: maximum seconds to wait

        Returns:
            CompletedProcess with stdout, stderr, returncode

        Raises:
            subprocess.TimeoutExpired: if the command runs longer than *timeout*.
        """
        cmd = [*self._resolve_agentkit_cmd(), *args]
        return subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
            env=self.env,
            cwd=self.cwd,
        )

    def run_server_command(
        self, args: list[str], server_url: str, timeout: int = 30
    ) -> subprocess.CompletedProcess[str]:
        """Run a CLI command that requires --server-url."""
        full_args = [*args, "--server-url", server_url]
        return self.run(full_args, timeout=timeout)
@pytest.fixture
def cli_runner(tmp_path: Any) -> CLIRunner:
    """Per-test CLI runner whose config dir and cwd live under *tmp_path*."""
    isolation = {
        "AGENTKIT_CONFIG_DIR": str(tmp_path / "config"),
        "AGENTKIT_WS_TIMEOUT": "0",
        # Prevent onboarding prompts
        "AGENTKIT_E2E_MODE": "1",
    }
    return CLIRunner(env={**os.environ, **isolation}, cwd=str(tmp_path))


@pytest.fixture(scope="session")
def cli_runner_session(e2e_server: str) -> CLIRunner:
    """Session-wide CLI runner pointed at the running E2E server."""
    server_settings = {
        "AGENTKIT_SERVER_URL": e2e_server,
        "AGENTKIT_API_KEY": E2E_API_KEY,
        "AGENTKIT_WS_TIMEOUT": "0",
        "AGENTKIT_E2E_MODE": "1",
    }
    return CLIRunner(env={**os.environ, **server_settings})
class WSChatHelper:
    """Helper for WebSocket chat E2E tests."""

    def __init__(self, base_ws_url: str, api_key: str):
        self.base_ws_url = base_ws_url
        self.api_key = api_key

    def _build_uri(self, session_id: str) -> str:
        """Return the chat WebSocket URI for *session_id*.

        Both the session id (a path segment) and the API key (a query value)
        are percent-encoded so that characters such as ``/``, ``?`` or ``&``
        cannot change the shape of the URL.
        """
        from urllib.parse import quote

        session_part = quote(session_id, safe="")
        key_part = quote(self.api_key, safe="")
        return f"{self.base_ws_url}/api/v1/chat/ws/{session_part}?api_key={key_part}"

    async def connect_and_chat(
        self,
        session_id: str,
        messages: list[dict[str, str]],
        timeout: float = 10.0,
    ) -> list[dict[str, Any]]:
        """Connect to a chat WebSocket, send messages, collect responses.

        Args:
            session_id: chat session ID
            messages: list of {"type": "message", "content": "..."}
            timeout: max seconds to wait for each server frame

        Returns:
            list of all server-sent messages, starting with the ``connected``
            event. If no frame arrives within *timeout*, a synthetic
            ``{"type": "timeout"}`` entry is appended and collection stops.
        """
        try:
            import websockets
        except ImportError:
            pytest.skip("websockets package not installed")

        uri = self._build_uri(session_id)
        received: list[dict[str, Any]] = []

        async with websockets.connect(uri) as ws:
            # Wait for connected event
            msg = await asyncio.wait_for(ws.recv(), timeout=timeout)
            data = json.loads(msg)
            received.append(data)
            assert data.get("type") == "connected", f"Expected connected, got {data}"

            # Send user messages
            for user_msg in messages:
                await ws.send(json.dumps(user_msg))

            # Collect responses until final_answer or error.
            # NOTE(review): collection stops at the FIRST terminal event even
            # when several messages were sent - confirm this is intended.
            while True:
                try:
                    raw = await asyncio.wait_for(ws.recv(), timeout=timeout)
                    resp = json.loads(raw)
                    received.append(resp)

                    if resp.get("type") in ("final_answer", "error"):
                        break
                except asyncio.TimeoutError:
                    received.append({"type": "timeout"})
                    break

        return received
"identity": f"You are a {name} assistant", + "instructions": f"Perform {name} tasks", + "output_format": "JSON", + }, + "intent": { + "keywords": keywords or [name], + "description": f"{name} skill for e2e testing", + }, + } + if execution_mode != "direct": + config["execution_mode"] = execution_mode + config["max_steps"] = 5 + + return api_client.post("/api/v1/skills", json={"config": config}) + + +def create_session_via_api(api_client: httpx.Client, agent_name: str = "test") -> str: + """Create a chat session and return the session ID.""" + resp = api_client.post("/api/v1/chat/sessions", json={"agent_name": agent_name}) + assert resp.status_code == 201, f"Failed to create session: {resp.text}" + return resp.json()["session_id"] + + +# --------------------------------------------------------------------------- +# Metrics Collector fixture +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def metrics_collector(request: pytest.FixtureRequest): + """Session-scoped metrics collector for capability analysis.""" + from tests.e2e.capability_metrics import MetricsCollector + + collector: MetricsCollector = request.config._e2e_metrics_collector # type: ignore[attr-defined] + return collector diff --git a/tests/e2e/test_basic_api.py b/tests/e2e/test_basic_api.py new file mode 100644 index 0000000..f053381 --- /dev/null +++ b/tests/e2e/test_basic_api.py @@ -0,0 +1,277 @@ +"""E2E Basic Function Tests — REST API endpoints. + +Verifies all API routes work correctly with proper request/response handling. + +Test categories: + 1. Health & metrics + 2. Agent CRUD lifecycle + 3. Skill registration & listing + 4. Task submission (sync/async/SSE) + 5. Chat session lifecycle + 6. LLM usage tracking + 7. 
Error handling & edge cases +""" + +import pytest +import httpx + +from tests.e2e.conftest import register_skill_via_api, create_session_via_api + + +# ═══════════════════════════════════════════════════════════════════════════ +# 1. Health & Metrics +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestHealthAPI: + def test_health_returns_ok(self, api_client: httpx.Client): + resp = api_client.get("/api/v1/health") + assert resp.status_code == 200 + data = resp.json() + assert data.get("status") in ("ok", "healthy") + + def test_metrics_endpoint(self, api_client: httpx.Client): + resp = api_client.get("/api/v1/metrics") + assert resp.status_code == 200 + + +# ═══════════════════════════════════════════════════════════════════════════ +# 2. Agent CRUD Lifecycle +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestAgentCRUD: + """Full Agent CRUD lifecycle: create → list → get → delete.""" + + def test_create_agent_from_skill(self, api_client: httpx.Client): + register_skill_via_api(api_client, "crud_skill", keywords=["crud"]) + resp = api_client.post("/api/v1/agents", json={"skill_name": "crud_skill"}) + assert resp.status_code == 201 + data = resp.json() + assert data["name"] == "crud_skill" + + def test_list_agents(self, api_client: httpx.Client): + register_skill_via_api(api_client, "list_skill", keywords=["list_agent"]) + api_client.post("/api/v1/agents", json={"skill_name": "list_skill"}) + resp = api_client.get("/api/v1/agents") + assert resp.status_code == 200 + agents = resp.json() + assert isinstance(agents, list) + assert any(a["name"] == "list_skill" for a in agents) + + def test_get_agent_detail(self, api_client: httpx.Client): + register_skill_via_api(api_client, "detail_skill", keywords=["detail"]) + api_client.post("/api/v1/agents", json={"skill_name": "detail_skill"}) + resp = api_client.get("/api/v1/agents/detail_skill") + 
assert resp.status_code == 200 + data = resp.json() + assert data["name"] == "detail_skill" + + def test_delete_agent(self, api_client: httpx.Client): + register_skill_via_api(api_client, "delete_skill", keywords=["delete_agent"]) + api_client.post("/api/v1/agents", json={"skill_name": "delete_skill"}) + resp = api_client.delete("/api/v1/agents/delete_skill") + assert resp.status_code == 204 + # Verify deleted + resp = api_client.get("/api/v1/agents/delete_skill") + assert resp.status_code == 404 + + def test_create_agent_nonexistent_skill(self, api_client: httpx.Client): + resp = api_client.post("/api/v1/agents", json={"skill_name": "nonexistent_skill_xyz"}) + assert resp.status_code in (400, 404) + + def test_get_nonexistent_agent(self, api_client: httpx.Client): + resp = api_client.get("/api/v1/agents/does_not_exist") + assert resp.status_code == 404 + + +# ═══════════════════════════════════════════════════════════════════════════ +# 3. Skill Registration & Listing +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestSkillAPI: + def test_register_skill(self, api_client: httpx.Client): + resp = register_skill_via_api(api_client, "reg_skill", keywords=["reg"]) + assert resp.status_code == 201 + + def test_list_skills(self, api_client: httpx.Client): + register_skill_via_api(api_client, "list_test_skill", keywords=["list_test"]) + resp = api_client.get("/api/v1/skills") + assert resp.status_code == 200 + skills = resp.json() + assert isinstance(skills, list) + assert len(skills) >= 1 + + def test_register_duplicate_skill(self, api_client: httpx.Client): + register_skill_via_api(api_client, "dup_skill", keywords=["dup"]) + resp = register_skill_via_api(api_client, "dup_skill", keywords=["dup"]) + # Should either overwrite or return conflict + assert resp.status_code in (200, 201, 409) + + def test_skill_with_execution_mode(self, api_client: httpx.Client): + resp = register_skill_via_api( + api_client, 
"react_skill", keywords=["react_test"], execution_mode="react" + ) + assert resp.status_code == 201 + + def test_skill_mention_suggest(self, api_client: httpx.Client): + register_skill_via_api(api_client, "mention_skill", keywords=["mention_test"]) + resp = api_client.get("/api/v1/skills/mention-suggest", params={"q": "mention"}) + assert resp.status_code == 200 + + +# ═══════════════════════════════════════════════════════════════════════════ +# 4. Task Submission +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestTaskAPI: + def test_submit_task_sync(self, api_client: httpx.Client): + register_skill_via_api(api_client, "sync_task_skill", keywords=["sync_task"]) + resp = api_client.post( + "/api/v1/tasks", + json={ + "input_data": {"query": "test sync task"}, + "skill_name": "sync_task_skill", + }, + ) + assert resp.status_code == 200 + data = resp.json() + assert "output" in data or "data" in data or "skill_name" in data + + def test_submit_task_with_agent_name(self, api_client: httpx.Client): + register_skill_via_api(api_client, "agent_task_skill", keywords=["agent_task"]) + api_client.post("/api/v1/agents", json={"skill_name": "agent_task_skill"}) + resp = api_client.post( + "/api/v1/tasks", + json={ + "input_data": {"query": "test agent task"}, + "agent_name": "agent_task_skill", + }, + ) + assert resp.status_code == 200 + + def test_submit_task_auto_route(self, api_client: httpx.Client): + register_skill_via_api(api_client, "auto_route_skill", keywords=["auto_route"]) + resp = api_client.post( + "/api/v1/tasks", + json={"input_data": {"query": "Please auto_route this for me"}}, + ) + assert resp.status_code == 200 + + def test_list_tasks(self, api_client: httpx.Client): + resp = api_client.get("/api/v1/tasks") + assert resp.status_code == 200 + + def test_submit_task_missing_data(self, api_client: httpx.Client): + resp = api_client.post("/api/v1/tasks", json={}) + # Should return 400 or 422 + 
assert resp.status_code in (400, 422) + + +# ═══════════════════════════════════════════════════════════════════════════ +# 5. Chat Session Lifecycle +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestChatSessionAPI: + def test_create_session(self, api_client: httpx.Client): + session_id = create_session_via_api(api_client) + assert session_id is not None + assert len(session_id) > 0 + + def test_list_sessions(self, api_client: httpx.Client): + create_session_via_api(api_client) + resp = api_client.get("/api/v1/chat/sessions") + assert resp.status_code == 200 + sessions = resp.json() + assert isinstance(sessions, list) + assert len(sessions) >= 1 + + def test_get_session(self, api_client: httpx.Client): + session_id = create_session_via_api(api_client) + resp = api_client.get(f"/api/v1/chat/sessions/{session_id}") + assert resp.status_code == 200 + + def test_session_messages(self, api_client: httpx.Client): + session_id = create_session_via_api(api_client) + # Send a message + resp = api_client.post( + f"/api/v1/chat/sessions/{session_id}/messages", + json={"content": "Hello from e2e test"}, + ) + assert resp.status_code == 200 + # Get messages + resp = api_client.get(f"/api/v1/chat/sessions/{session_id}/messages") + assert resp.status_code == 200 + + def test_close_session(self, api_client: httpx.Client): + session_id = create_session_via_api(api_client) + resp = api_client.delete(f"/api/v1/chat/sessions/{session_id}") + assert resp.status_code == 200 + + +# ═══════════════════════════════════════════════════════════════════════════ +# 6. 
LLM Usage Tracking +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestLLMUsageAPI: + def test_llm_usage_endpoint(self, api_client: httpx.Client): + resp = api_client.get("/api/v1/llm/usage") + assert resp.status_code == 200 + + def test_llm_usage_after_task(self, api_client: httpx.Client): + register_skill_via_api(api_client, "usage_track_skill", keywords=["usage_track"]) + api_client.post( + "/api/v1/tasks", + json={ + "input_data": {"query": "test usage tracking"}, + "skill_name": "usage_track_skill", + }, + ) + resp = api_client.get("/api/v1/llm/usage") + assert resp.status_code == 200 + + +# ═══════════════════════════════════════════════════════════════════════════ +# 7. Error Handling & Edge Cases +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestAPIErrorHandling: + def test_404_for_unknown_route(self, api_client: httpx.Client): + resp = api_client.get("/api/v1/nonexistent_route") + assert resp.status_code == 404 + + def test_invalid_json_body(self, api_client: httpx.Client): + resp = api_client.post( + "/api/v1/tasks", + content=b"not json", + headers={"Content-Type": "application/json"}, + ) + assert resp.status_code in (400, 422) + + def test_missing_api_key(self, e2e_server: str): + """Requests without API key should be rejected (if auth enabled).""" + client = httpx.Client(base_url=e2e_server, timeout=10) + resp = client.get("/api/v1/agents") + # Should be 401/403 or still 200 if auth is not enforced on this endpoint + assert resp.status_code in (200, 401, 403) + + def test_invalid_api_key(self, e2e_server: str): + client = httpx.Client( + base_url=e2e_server, + headers={"X-API-Key": "invalid_key"}, + timeout=10, + ) + resp = client.get("/api/v1/agents") + assert resp.status_code in (200, 401, 403) diff --git a/tests/e2e/test_basic_cli.py b/tests/e2e/test_basic_cli.py new file mode 100644 index 0000000..79688a3 --- 
/dev/null +++ b/tests/e2e/test_basic_cli.py @@ -0,0 +1,353 @@ +"""E2E Basic Function Tests — CLI commands. + +Verifies that all CLI commands execute correctly as a real user would invoke them. +Uses subprocess (OpenCLI pattern) to simulate actual CLI operations. + +Test categories: + 1. Utility commands: version, doctor, help + 2. Init & config: agentkit init + 3. Pair: API key generation + 4. Skill management: list, load, info + 5. Task management: submit, status, list, cancel + 6. Server: serve startup +""" + +import json +import os + +import pytest + +from tests.e2e.conftest import CLIRunner, E2E_BASE_URL + + +# ═══════════════════════════════════════════════════════════════════════════ +# 1. Utility Commands +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestCLIVersion: + """agentkit version — basic sanity check.""" + + def test_version_returns_zero_exit_code(self, cli_runner: CLIRunner): + result = cli_runner.run(["version"]) + assert result.returncode == 0, f"stdout: {result.stdout}\nstderr: {result.stderr}" + + def test_version_outputs_version_string(self, cli_runner: CLIRunner): + result = cli_runner.run(["version"]) + assert "0.1.0" in result.stdout or "fischer-agentkit" in result.stdout.lower() + + def test_version_help(self, cli_runner: CLIRunner): + result = cli_runner.run(["version", "--help"]) + assert result.returncode == 0 + + +@pytest.mark.e2e_basic +class TestCLIDoctor: + """agentkit doctor — server health check.""" + + def test_doctor_server_not_running(self, cli_runner: CLIRunner): + """Doctor should report error when no server is running.""" + result = cli_runner.run(["doctor"]) + # Should indicate server not reachable + output = (result.stdout + result.stderr).lower() + assert ( + result.returncode != 0 + or "not running" in output + or "error" in output + or "connection" in output + ) + + def test_doctor_with_running_server(self, cli_runner_session: CLIRunner): + """Doctor 
should report healthy when E2E server is running.""" + result = cli_runner_session.run(["doctor", "--port", "18765"]) + output = (result.stdout + result.stderr).lower() + # Should show some health info (ok, healthy, or at least not connection refused) + assert "connection refused" not in output or result.returncode == 0 + + +@pytest.mark.e2e_basic +class TestCLIHelp: + """agentkit --help — command discovery.""" + + def test_help_shows_all_subcommands(self, cli_runner: CLIRunner): + result = cli_runner.run(["--help"]) + assert result.returncode == 0 + for cmd in [ + "serve", + "gui", + "chat", + "version", + "doctor", + "init", + "task", + "skill", + "usage", + "pair", + ]: + assert cmd in result.stdout, f"Missing subcommand '{cmd}' in help output" + + def test_task_help(self, cli_runner: CLIRunner): + result = cli_runner.run(["task", "--help"]) + assert result.returncode == 0 + for sub in ["submit", "status", "list", "cancel"]: + assert sub in result.stdout + + def test_skill_help(self, cli_runner: CLIRunner): + result = cli_runner.run(["skill", "--help"]) + assert result.returncode == 0 + for sub in ["list", "load", "info"]: + assert sub in result.stdout + + +# ═══════════════════════════════════════════════════════════════════════════ +# 2. 
Init & Config +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestCLIInit: + """agentkit init — project initialization.""" + + def test_init_non_interactive(self, cli_runner: CLIRunner, tmp_path): + output_dir = str(tmp_path / "init_output") + os.makedirs(output_dir, exist_ok=True) + result = cli_runner.run(["init", "--non-interactive", "--output-dir", output_dir]) + assert result.returncode == 0, f"stderr: {result.stderr}" + assert os.path.exists(os.path.join(output_dir, "agentkit.yaml")) + assert os.path.exists(os.path.join(output_dir, ".env.example")) + + def test_init_generates_valid_yaml(self, cli_runner: CLIRunner, tmp_path): + import yaml + + output_dir = str(tmp_path / "init_yaml") + os.makedirs(output_dir, exist_ok=True) + cli_runner.run(["init", "--non-interactive", "--output-dir", output_dir]) + with open(os.path.join(output_dir, "agentkit.yaml")) as f: + config = yaml.safe_load(f) + assert "server" in config + assert "llm" in config + + def test_init_no_overwrite_without_force(self, cli_runner: CLIRunner, tmp_path): + output_dir = str(tmp_path / "init_no_overwrite") + os.makedirs(output_dir, exist_ok=True) + # Create existing file + with open(os.path.join(output_dir, "agentkit.yaml"), "w") as f: + f.write("existing_content") + cli_runner.run(["init", "--non-interactive", "--output-dir", output_dir]) + with open(os.path.join(output_dir, "agentkit.yaml")) as f: + content = f.read() + # Should not overwrite + assert content == "existing_content" + + def test_init_force_overwrites(self, cli_runner: CLIRunner, tmp_path): + output_dir = str(tmp_path / "init_force") + os.makedirs(output_dir, exist_ok=True) + with open(os.path.join(output_dir, "agentkit.yaml"), "w") as f: + f.write("old") + result = cli_runner.run( + ["init", "--non-interactive", "--force", "--output-dir", output_dir] + ) + assert result.returncode == 0 + with open(os.path.join(output_dir, "agentkit.yaml")) as f: + content = 
f.read() + assert "server" in content + + +# ═══════════════════════════════════════════════════════════════════════════ +# 3. Pair (API Key Generation) +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestCLIPair: + """agentkit pair — external system API key management.""" + + def test_pair_generates_api_key(self, cli_runner: CLIRunner, tmp_path): + config_dir = str(tmp_path / "pair_config") + os.makedirs(config_dir, exist_ok=True) + result = cli_runner.run(["pair", "--name", "e2e-test-client", "--config-dir", config_dir]) + assert result.returncode == 0, f"stderr: {result.stderr}" + assert "ak_live_" in result.stdout + + def test_pair_saves_client_config(self, cli_runner: CLIRunner, tmp_path): + import yaml + + config_dir = str(tmp_path / "pair_save") + os.makedirs(config_dir, exist_ok=True) + cli_runner.run(["pair", "--name", "e2e-client", "--config-dir", config_dir]) + clients_path = os.path.join(config_dir, "clients.yaml") + assert os.path.exists(clients_path) + with open(clients_path) as f: + clients = yaml.safe_load(f) + assert "e2e-client" in clients + assert clients["e2e-client"]["api_key"].startswith("ak_live_") + + def test_pair_rejects_duplicate_name(self, cli_runner: CLIRunner, tmp_path): + config_dir = str(tmp_path / "pair_dup") + os.makedirs(config_dir, exist_ok=True) + cli_runner.run(["pair", "--name", "dup-client", "--config-dir", config_dir]) + result = cli_runner.run(["pair", "--name", "dup-client", "--config-dir", config_dir]) + output = (result.stdout + result.stderr).lower() + assert result.returncode != 0 or "already" in output or "exists" in output + + def test_pair_list(self, cli_runner: CLIRunner, tmp_path): + config_dir = str(tmp_path / "pair_list") + os.makedirs(config_dir, exist_ok=True) + cli_runner.run(["pair", "--name", "client-a", "--config-dir", config_dir]) + cli_runner.run(["pair", "--name", "client-b", "--config-dir", config_dir]) + result = cli_runner.run(["pair", 
"--list", "--config-dir", config_dir]) + assert result.returncode == 0 + assert "client-a" in result.stdout + assert "client-b" in result.stdout + + def test_pair_revoke(self, cli_runner: CLIRunner, tmp_path): + import yaml + + config_dir = str(tmp_path / "pair_revoke") + os.makedirs(config_dir, exist_ok=True) + cli_runner.run(["pair", "--name", "revoke-me", "--config-dir", config_dir]) + result = cli_runner.run(["pair", "--revoke", "revoke-me", "--config-dir", config_dir]) + assert result.returncode == 0 + with open(os.path.join(config_dir, "clients.yaml")) as f: + clients = yaml.safe_load(f) + assert "revoke-me" not in clients + + +# ═══════════════════════════════════════════════════════════════════════════ +# 4. Skill Management (CLI → Server) +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestCLISkill: + """agentkit skill — skill management via CLI against running server.""" + + def test_skill_list_via_server(self, cli_runner_session: CLIRunner): + result = cli_runner_session.run_server_command(["skill", "list"], E2E_BASE_URL) + assert result.returncode == 0, f"stderr: {result.stderr}" + + def test_skill_load_yaml(self, cli_runner: CLIRunner, tmp_path): + import yaml + + skill_file = tmp_path / "test_skill.yaml" + skill_file.write_text( + yaml.dump( + { + "name": "e2e_test_skill", + "description": "E2E test skill", + "agent_type": "assistant", + "task_mode": "llm_generate", + "prompt": {"system": "You are a test assistant"}, + } + ) + ) + result = cli_runner.run(["skill", "load", str(skill_file)]) + # Should load successfully or report loaded + output = (result.stdout + result.stderr).lower() + assert result.returncode == 0 or "loaded" in output or "e2e_test_skill" in output + + def test_skill_info_via_server(self, cli_runner_session: CLIRunner, api_client): + # First register a skill via API + from tests.e2e.conftest import register_skill_via_api + + register_skill_via_api(api_client, 
"cli_info_skill", keywords=["cli_info"]) + # Then query via CLI + result = cli_runner_session.run_server_command( + ["skill", "info", "cli_info_skill"], E2E_BASE_URL + ) + assert result.returncode == 0 + assert "cli_info_skill" in result.stdout + + +# ═══════════════════════════════════════════════════════════════════════════ +# 5. Task Management (CLI → Server) +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestCLITask: + """agentkit task — task management via CLI against running server.""" + + def test_task_submit_sync(self, cli_runner_session: CLIRunner, api_client): + from tests.e2e.conftest import register_skill_via_api + + register_skill_via_api(api_client, "cli_task_skill", keywords=["cli_task"]) + result = cli_runner_session.run_server_command( + [ + "task", + "submit", + "--skill", + "cli_task_skill", + "--input", + json.dumps({"query": "test task submission"}), + ], + E2E_BASE_URL, + ) + assert result.returncode == 0, f"stdout: {result.stdout}\nstderr: {result.stderr}" + + def test_task_submit_async(self, cli_runner_session: CLIRunner, api_client): + from tests.e2e.conftest import register_skill_via_api + + register_skill_via_api(api_client, "cli_async_skill", keywords=["cli_async"]) + result = cli_runner_session.run_server_command( + [ + "task", + "submit", + "--skill", + "cli_async_skill", + "--mode", + "async", + "--input", + json.dumps({"query": "async task test"}), + ], + E2E_BASE_URL, + ) + assert result.returncode == 0 + + def test_task_list(self, cli_runner_session: CLIRunner): + result = cli_runner_session.run_server_command(["task", "list"], E2E_BASE_URL) + assert result.returncode == 0 + + def test_task_submit_input_file(self, cli_runner_session: CLIRunner, api_client, tmp_path): + from tests.e2e.conftest import register_skill_via_api + + register_skill_via_api(api_client, "cli_file_skill", keywords=["cli_file"]) + + input_file = tmp_path / "task_input.json" + 
input_file.write_text(json.dumps({"query": "file input test"})) + + result = cli_runner_session.run_server_command( + [ + "task", + "submit", + "--skill", + "cli_file_skill", + "--input-file", + str(input_file), + ], + E2E_BASE_URL, + ) + assert result.returncode == 0 + + +# ═══════════════════════════════════════════════════════════════════════════ +# 6. Server Startup +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestCLIServe: + """agentkit serve — server startup (basic check, not full lifecycle).""" + + def test_serve_help(self, cli_runner: CLIRunner): + result = cli_runner.run(["serve", "--help"]) + assert result.returncode == 0 + assert "--host" in result.stdout + assert "--port" in result.stdout + + def test_serve_invalid_port(self, cli_runner: CLIRunner): + """Serve with an invalid port should fail gracefully.""" + result = cli_runner.run(["serve", "--port", "not_a_port"], timeout=5) + # Should error out, not hang + assert result.returncode != 0 or "error" in (result.stdout + result.stderr).lower() diff --git a/tests/e2e/test_basic_websocket.py b/tests/e2e/test_basic_websocket.py new file mode 100644 index 0000000..dabc4f7 --- /dev/null +++ b/tests/e2e/test_basic_websocket.py @@ -0,0 +1,170 @@ +"""E2E Basic Function Tests — WebSocket chat protocol. + +Verifies the WebSocket chat protocol works correctly: + 1. Connection lifecycle (connect → connected → ping/pong → disconnect) + 2. Message exchange (user message → token stream → final_answer) + 3. Confirmation flow (confirmation_request → confirmation_reply → confirmation_result) + 4. AskHuman flow (ask_human → reply → continue) + 5. Cancel flow (cancel → error/cancelled) + 6. 
Expert team events (team_formed → expert_step → team_synthesis → team_dissolved) +""" + +import asyncio +import json + +import pytest + +from tests.e2e.conftest import WSChatHelper, create_session_via_api, register_skill_via_api + + +# ═══════════════════════════════════════════════════════════════════════════ +# 1. Connection Lifecycle +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestWSConnection: + @pytest.mark.asyncio + async def test_connect_receives_connected_event(self, ws_helper: WSChatHelper, api_client): + session_id = create_session_via_api(api_client) + messages = await ws_helper.connect_and_chat(session_id, []) + assert len(messages) >= 1 + assert messages[0].get("type") == "connected" + + @pytest.mark.asyncio + async def test_ping_pong(self, ws_helper: WSChatHelper, api_client): + """Ping should receive pong response.""" + try: + import websockets + except ImportError: + pytest.skip("websockets not installed") + + session_id = create_session_via_api(api_client) + uri = f"{ws_helper.base_ws_url}/api/v1/chat/ws/{session_id}?api_key={ws_helper.api_key}" + + received: list[dict] = [] + async with websockets.connect(uri) as ws: + # Wait for connected + msg = await asyncio.wait_for(ws.recv(), timeout=10) + received.append(json.loads(msg)) + + # Send ping + await ws.send(json.dumps({"type": "ping"})) + raw = await asyncio.wait_for(ws.recv(), timeout=10) + resp = json.loads(raw) + assert resp.get("type") == "pong" + + @pytest.mark.asyncio + async def test_invalid_session_id(self, ws_helper: WSChatHelper): + """Connecting with invalid session ID should fail.""" + try: + import websockets + except ImportError: + pytest.skip("websockets not installed") + + uri = f"{ws_helper.base_ws_url}/api/v1/chat/ws/nonexistent-session?api_key={ws_helper.api_key}" + with pytest.raises(Exception): + async with websockets.connect(uri) as ws: + await asyncio.wait_for(ws.recv(), timeout=5) + + +# 
═══════════════════════════════════════════════════════════════════════════ +# 2. Message Exchange +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestWSMessageExchange: + @pytest.mark.asyncio + async def test_send_message_get_response(self, ws_helper: WSChatHelper, api_client): + session_id = create_session_via_api(api_client) + messages = await ws_helper.connect_and_chat( + session_id, + [{"type": "message", "content": "Hello, this is an e2e test"}], + ) + # Should receive at least: connected + some response (token/final_answer/error) + assert len(messages) >= 2 + # Last meaningful message should be final_answer or error + response_types = [m.get("type") for m in messages] + assert any(t in response_types for t in ("final_answer", "error", "token", "thinking")) + + @pytest.mark.asyncio + async def test_message_types_are_valid(self, ws_helper: WSChatHelper, api_client): + """All server-sent messages should have a valid 'type' field.""" + session_id = create_session_via_api(api_client) + messages = await ws_helper.connect_and_chat( + session_id, + [{"type": "message", "content": "Test valid message types"}], + ) + valid_types = { + "connected", + "token", + "thinking", + "step", + "final_answer", + "skill_match", + "confirmation_request", + "confirmation_result", + "ask_human", + "error", + "pong", + "team_formed", + "expert_step", + "expert_result", + "plan_update", + "team_synthesis", + "team_dissolved", + } + for msg in messages: + if isinstance(msg, dict) and "type" in msg: + assert msg["type"] in valid_types, f"Invalid message type: {msg['type']}" + + +# ═══════════════════════════════════════════════════════════════════════════ +# 3. 
Cancel Flow +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestWSCancel: + @pytest.mark.asyncio + async def test_cancel_message_accepted(self, ws_helper: WSChatHelper, api_client): + """Sending cancel should be accepted by the server.""" + try: + import websockets + except ImportError: + pytest.skip("websockets not installed") + + session_id = create_session_via_api(api_client) + uri = f"{ws_helper.base_ws_url}/api/v1/chat/ws/{session_id}?api_key={ws_helper.api_key}" + + async with websockets.connect(uri) as ws: + # Wait for connected + await asyncio.wait_for(ws.recv(), timeout=10) + # Send a message first + await ws.send(json.dumps({"type": "message", "content": "Start a task"})) + # Immediately send cancel + await ws.send(json.dumps({"type": "cancel"})) + # Server should handle gracefully (no crash) + + +# ═══════════════════════════════════════════════════════════════════════════ +# 4. Skill Match Event +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_basic +class TestWSSkillMatch: + @pytest.mark.asyncio + async def test_skill_match_notification(self, ws_helper: WSChatHelper, api_client): + """When a skill is matched, server should send skill_match event.""" + register_skill_via_api(api_client, "ws_skill", keywords=["ws_skill_match"]) + session_id = create_session_via_api(api_client) + messages = await ws_helper.connect_and_chat( + session_id, + [{"type": "message", "content": "Please use ws_skill_match for this"}], + ) + # Check if skill_match event was sent (may or may not happen depending on routing) + _ = [m.get("type") for m in messages] # noqa: F841 + # At minimum, we should get a response (skill_match or direct answer) + assert len(messages) >= 2 diff --git a/tests/e2e/test_capability_alignment.py b/tests/e2e/test_capability_alignment.py new file mode 100644 index 0000000..25c45f4 --- /dev/null +++ b/tests/e2e/test_capability_alignment.py 
@@ -0,0 +1,305 @@ +"""E2E capability tests for AlignmentGuard and CascadeDetector. + +Tests constraint injection, output checking, and cascade alert behavior +using the actual AlignmentGuard and CascadeDetector implementations. +""" + +from __future__ import annotations + +from datetime import datetime, timezone + +import pytest + +from agentkit.quality.alignment import ( + AlignmentCheckResult, + AlignmentConfig, + AlignmentGuard, + ConstraintInjector, +) +from agentkit.quality.cascade_detector import CascadeDetector + +from tests.e2e.benchmark_dataset import BenchmarkCase +from tests.e2e.capability_metrics import CapabilityObservation, MetricsCollector + + +# --------------------------------------------------------------------------- +# Alignment benchmark cases +# --------------------------------------------------------------------------- + + +ALIGNMENT_BENCHMARKS: list[BenchmarkCase] = [ + # Negative constraints: "不要X" → output should NOT contain X + BenchmarkCase( + id="align-neg-001", + input="请分析市场趋势,不要提及价格信息", + expected_skill=None, + expected_execution_mode="react", + expected_complexity="high", + category="alignment", + subcategory="negative_constraint", + ), + BenchmarkCase( + id="align-neg-002", + input="总结这篇文章,禁止包含个人观点", + expected_skill=None, + expected_execution_mode="react", + expected_complexity="medium", + category="alignment", + subcategory="negative_constraint", + ), + # Positive constraints: "必须X" → output SHOULD contain X + BenchmarkCase( + id="align-pos-001", + input="分析竞争对手,必须包含摘要部分", + expected_skill=None, + expected_execution_mode="react", + expected_complexity="high", + category="alignment", + subcategory="positive_constraint", + ), + BenchmarkCase( + id="align-pos-002", + input="审查代码,需要提供改进建议", + expected_skill=None, + expected_execution_mode="react", + expected_complexity="medium", + category="alignment", + subcategory="positive_constraint", + ), + # Cascade alert: repeated interactions should trigger alert + BenchmarkCase( + 
id="align-cascade-001", + input="重复执行相似查询触发级联告警", + expected_skill=None, + expected_execution_mode="react", + expected_complexity="medium", + category="alignment", + subcategory="cascade_alert", + ), + # No constraints: should pass cleanly + BenchmarkCase( + id="align-none-001", + input="帮我分析一下用户数据", + expected_skill=None, + expected_execution_mode="react", + expected_complexity="medium", + category="alignment", + subcategory="no_constraint", + ), +] + + +# --------------------------------------------------------------------------- +# Tests: ConstraintInjector +# --------------------------------------------------------------------------- + + +class TestConstraintInjector: + def test_inject_constraints(self) -> None: + config = AlignmentConfig(constraints=["不要提及价格", "必须包含摘要"]) + injector = ConstraintInjector(config) + input_data = {"query": "分析市场趋势"} + result = injector.inject(input_data) + assert "alignment_constraints" in result + assert result["alignment_constraints"] == ["不要提及价格", "必须包含摘要"] + # Original data should not be modified + assert "alignment_constraints" not in input_data + + +# --------------------------------------------------------------------------- +# Tests: AlignmentGuard rule-based checking +# --------------------------------------------------------------------------- + + +class TestAlignmentGuardRuleCheck: + @pytest.fixture + def guard(self) -> AlignmentGuard: + config = AlignmentConfig( + constraints=["不要提及价格信息", "必须摘要"], + audit_enabled=False, + ) + return AlignmentGuard(config) + + @pytest.mark.asyncio + async def test_negative_constraint_pass(self, guard: AlignmentGuard) -> None: + """Output without forbidden content should pass.""" + output = {"content": "市场趋势分析:整体呈上升趋势。摘要:市场表现良好。"} + result = await guard.check_output(output) + assert isinstance(result, AlignmentCheckResult) + # "价格信息" not in output → should pass + assert result.passed is True + + @pytest.mark.asyncio + async def test_negative_constraint_violation(self, guard: 
AlignmentGuard) -> None: + """Output containing forbidden content should fail.""" + output = {"content": "当前提及价格信息显示市场上涨。摘要:市场持续走高。"} + result = await guard.check_output(output) + assert result.passed is False + assert len(result.violations) > 0 + + @pytest.mark.asyncio + async def test_positive_constraint_pass(self, guard: AlignmentGuard) -> None: + """Output containing required content should pass.""" + output = {"content": "分析结果如下。摘要:市场趋势向好。"} + result = await guard.check_output(output) + assert result.passed is True + + @pytest.mark.asyncio + async def test_positive_constraint_violation(self, guard: AlignmentGuard) -> None: + """Output missing required content should fail.""" + output = {"content": "分析结果如下。市场趋势向好。"} + result = await guard.check_output(output) + assert result.passed is False + + @pytest.mark.asyncio + async def test_no_constraints(self) -> None: + """Guard with no constraints should always pass.""" + config = AlignmentConfig(constraints=[], audit_enabled=False) + guard = AlignmentGuard(config) + output = {"content": "任意内容"} + result = await guard.check_output(output) + assert result.passed is True + + @pytest.mark.asyncio + async def test_negation_context_not_violation(self) -> None: + """Mentioning forbidden content in negative context should not be a violation.""" + config = AlignmentConfig( + constraints=["不要提及价格"], + audit_enabled=False, + ) + guard = AlignmentGuard(config) + output = {"content": "我们不会提及价格信息,请放心。摘要:市场分析完成。"} + result = await guard.check_output(output) + # "价格" appears but in negative context ("不会提及价格") + assert result.passed is True + + +# --------------------------------------------------------------------------- +# Tests: CascadeDetector +# --------------------------------------------------------------------------- + + +class TestCascadeDetector: + def test_no_alert_below_threshold(self) -> None: + detector = CascadeDetector(max_interactions=5) + for _ in range(5): + alert = detector.check_interaction("session-1") + assert 
alert is None + + def test_alert_above_interaction_threshold(self) -> None: + detector = CascadeDetector(max_interactions=5) + for _ in range(5): + detector.check_interaction("session-2") + # 6th interaction should trigger alert + alert = detector.check_interaction("session-2") + assert alert is not None + assert alert.alert_type == "interaction_limit" + assert alert.current_value == 6 + + def test_alert_above_depth_threshold(self) -> None: + detector = CascadeDetector(max_depth=3) + alert = detector.check_depth("session-3", 4) + assert alert is not None + assert alert.alert_type == "loop_depth" + assert alert.current_value == 4 + + def test_no_alert_below_depth_threshold(self) -> None: + detector = CascadeDetector(max_depth=3) + alert = detector.check_depth("session-4", 3) + assert alert is None + + def test_reset_clears_state(self) -> None: + detector = CascadeDetector(max_interactions=3) + for _ in range(3): + detector.check_interaction("session-5") + detector.reset("session-5") + # After reset, count should be back to 0 + alert = detector.check_interaction("session-5") + assert alert is None # count is now 1, below threshold + + +# --------------------------------------------------------------------------- +# Tests: AlignmentGuard cascade integration +# --------------------------------------------------------------------------- + + +class TestAlignmentGuardCascade: + def test_record_interaction_returns_alert(self) -> None: + config = AlignmentConfig(cascade_max_interactions=3) + guard = AlignmentGuard(config) + for _ in range(3): + guard.record_interaction("session-10") + alert = guard.record_interaction("session-10") + assert alert is not None + assert alert.alert_type == "interaction_limit" + + def test_record_loop_depth_returns_alert(self) -> None: + config = AlignmentConfig(cascade_max_depth=2) + guard = AlignmentGuard(config) + alert = guard.record_loop_depth("session-11", 3) + assert alert is not None + assert alert.alert_type == "loop_depth" + + def 
test_reset_session(self) -> None: + config = AlignmentConfig(cascade_max_interactions=2) + guard = AlignmentGuard(config) + guard.record_interaction("session-12") + guard.record_interaction("session-12") + guard.reset_session("session-12") + assert guard.get_interaction_count("session-12") == 0 + + +# --------------------------------------------------------------------------- +# Tests: Metrics collection for alignment +# --------------------------------------------------------------------------- + + +class TestAlignmentMetricsCollection: + def test_record_alignment_observation(self) -> None: + collector = MetricsCollector() + obs = CapabilityObservation( + benchmark_id="align-neg-001", + test_name="test_neg_constraint", + timestamp=datetime.now(timezone.utc).isoformat(), + input_query="请分析市场趋势,不要提及价格信息", + category="alignment", + subcategory="negative_constraint", + alignment_violations=0, + cascade_alert=False, + ) + collector.record(obs) + alignment_obs = collector.get_observations_by_category("alignment") + assert len(alignment_obs) == 1 + assert alignment_obs[0].alignment_violations == 0 + + def test_record_alignment_with_violations(self) -> None: + collector = MetricsCollector() + obs = CapabilityObservation( + benchmark_id="align-neg-002", + test_name="test_neg_constraint_violation", + timestamp=datetime.now(timezone.utc).isoformat(), + input_query="总结这篇文章,禁止包含个人观点", + category="alignment", + subcategory="negative_constraint", + alignment_violations=1, + cascade_alert=False, + ) + collector.record(obs) + alignment_obs = collector.get_observations_by_category("alignment") + assert alignment_obs[0].alignment_violations == 1 + + def test_record_cascade_alert(self) -> None: + collector = MetricsCollector() + obs = CapabilityObservation( + benchmark_id="align-cascade-001", + test_name="test_cascade_alert", + timestamp=datetime.now(timezone.utc).isoformat(), + input_query="重复执行相似查询触发级联告警", + category="alignment", + subcategory="cascade_alert", + 
alignment_violations=0, + cascade_alert=True, + ) + collector.record(obs) + alignment_obs = collector.get_observations_by_category("alignment") + assert alignment_obs[0].cascade_alert is True diff --git a/tests/e2e/test_capability_react.py b/tests/e2e/test_capability_react.py new file mode 100644 index 0000000..6eb05e7 --- /dev/null +++ b/tests/e2e/test_capability_react.py @@ -0,0 +1,324 @@ +"""E2E Agent Capability Tests — ReAct Reasoning & Execution with Metrics. + +Tests the intelligence of agent execution AND collects data for: + - Execution mode selection accuracy + - Quality gate effectiveness + - Task success rate by mode + - Output standardization consistency + - Overfitting detection via paraphrased inputs +""" + +import pytest +import httpx + +from tests.e2e.benchmark_dataset import EXECUTION_BENCHMARKS, BenchmarkCase +from tests.e2e.capability_metrics import MetricsCollector +from tests.e2e.conftest import register_skill_via_api + + +# ═══════════════════════════════════════════════════════════════════════════ +# Helper: run execution benchmark and record metrics +# ═══════════════════════════════════════════════════════════════════════════ + + +def _run_exec_benchmark( + benchmark: BenchmarkCase, + api_client: httpx.Client, + collector: MetricsCollector, + test_name: str, + is_paraphrase: bool = False, + input_override: str | None = None, +) -> dict: + """Execute an execution benchmark and record metrics.""" + query = input_override or benchmark.input + collector.start_timer(benchmark.id) + + payload: dict = {"input_data": {"query": query}} + if benchmark.expected_skill is not None: + payload["skill_name"] = benchmark.expected_skill + + resp = api_client.post("/api/v1/tasks", json=payload) + + actual_skill = None + actual_exec_mode = None + actual_keys = [] + task_succeeded = resp.status_code == 200 + error_msg = None + + if task_succeeded: + data = resp.json() + actual_skill = data.get("skill_name") + actual_exec_mode = data.get("execution_mode") + 
actual_keys = list(data.keys()) + elif resp.status_code >= 400: + try: + error_msg = resp.json().get("detail", resp.text[:200]) + except Exception: + error_msg = resp.text[:200] + + collector.record_benchmark_result( + benchmark, + test_name=test_name, + actual_skill=actual_skill, + actual_execution_mode=actual_exec_mode, + actual_status_code=resp.status_code, + actual_response_keys=actual_keys, + task_succeeded=task_succeeded, + is_paraphrase=is_paraphrase, + error_message=error_msg, + ) + + return { + "status_code": resp.status_code, + "actual_skill": actual_skill, + "actual_exec_mode": actual_exec_mode, + "actual_keys": actual_keys, + "task_succeeded": task_succeeded, + } + + +# ═══════════════════════════════════════════════════════════════════════════ +# Parameterized Execution Benchmark Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_capability +class TestExecutionBenchmarks: + """Run all execution benchmarks with metrics collection.""" + + @pytest.mark.parametrize( + "benchmark", + EXECUTION_BENCHMARKS, + ids=[b.id for b in EXECUTION_BENCHMARKS], + ) + def test_execution_benchmark( + self, + benchmark: BenchmarkCase, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """Run original execution benchmark and record metrics.""" + # Register the skill if expected + if benchmark.expected_skill: + exec_mode = ( + benchmark.expected_execution_mode + if benchmark.expected_execution_mode != "direct" + else "direct" + ) + register_skill_via_api( + api_client, + benchmark.expected_skill, + keywords=[benchmark.expected_skill], + execution_mode=exec_mode, + ) + + result = _run_exec_benchmark( + benchmark, + api_client, + metrics_collector, + test_name=f"exec_benchmark_{benchmark.id}", + ) + assert result["status_code"] == 200, f"Benchmark {benchmark.id} failed" + + @pytest.mark.parametrize( + "benchmark", + [b for b in EXECUTION_BENCHMARKS if b.paraphrases], + ids=[b.id for b in 
EXECUTION_BENCHMARKS if b.paraphrases], + ) + def test_execution_paraphrase( + self, + benchmark: BenchmarkCase, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """Run paraphrases for overfitting detection.""" + for i, paraphrase in enumerate(benchmark.paraphrases): + _run_exec_benchmark( + benchmark, + api_client, + metrics_collector, + test_name=f"exec_paraphrase_{benchmark.id}_{i}", + is_paraphrase=True, + input_override=paraphrase, + ) + + +# ═══════════════════════════════════════════════════════════════════════════ +# ReAct Loop Intelligence +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_capability +class TestReActIntelligence: + """Test that ReAct agents reason correctly through Think→Act→Observe.""" + + def test_react_skill_executes_steps( + self, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """ReAct skill should execute multiple steps for complex tasks.""" + benchmark = BenchmarkCase( + id="react-steps-001", + input="Research and analyze the impact of AI on healthcare", + expected_skill="react_reasoner", + expected_execution_mode="react", + expected_complexity="high", + category="execution", + subcategory="react_mode", + paraphrases=["Investigate AI's effect on medical industry", "调研AI对医疗行业的影响"], + ) + register_skill_via_api( + api_client, + "react_reasoner", + keywords=["react_reason", "research", "analyze"], + execution_mode="react", + ) + + result = _run_exec_benchmark( + benchmark, + api_client, + metrics_collector, + test_name="react_steps", + ) + assert result["status_code"] == 200 + + for i, para in enumerate(benchmark.paraphrases): + _run_exec_benchmark( + benchmark, + api_client, + metrics_collector, + test_name=f"react_steps_para_{i}", + is_paraphrase=True, + input_override=para, + ) + + +# ═══════════════════════════════════════════════════════════════════════════ +# Quality Gate Intelligence +# 
═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_capability +class TestQualityGateIntelligence: + """Test that quality gate correctly validates and retries outputs.""" + + def test_quality_gate_with_required_fields( + self, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """Quality gate should enforce required_fields in output.""" + benchmark = BenchmarkCase( + id="quality-fields-001", + input="Generate content with quality check", + expected_skill="quality_skill", + expected_complexity="medium", + category="execution", + subcategory="quality_gate", + ) + register_skill_via_api(api_client, "quality_skill", keywords=["quality_test"]) + + result = _run_exec_benchmark( + benchmark, + api_client, + metrics_collector, + test_name="quality_fields", + ) + assert result["status_code"] in (200, 400, 422) + + def test_quality_gate_rejects_empty_output( + self, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """Quality gate should reject empty or minimal output.""" + benchmark = BenchmarkCase( + id="quality-empty-001", + input="", + expected_skill="quality_empty", + expected_complexity="low", + category="execution", + subcategory="quality_gate", + ) + register_skill_via_api(api_client, "quality_empty", keywords=["quality_empty"]) + + result = _run_exec_benchmark( + benchmark, + api_client, + metrics_collector, + test_name="quality_empty", + ) + # Should handle gracefully + assert result["status_code"] in (200, 400, 422) + + +# ═══════════════════════════════════════════════════════════════════════════ +# Output Standardization Intelligence +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_capability +class TestOutputStandardization: + """Test that agent outputs are properly standardized.""" + + def test_output_has_required_structure( + self, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """Task 
results should have a consistent structure.""" + register_skill_via_api(api_client, "output_std_skill", keywords=["output_std"]) + + benchmark = BenchmarkCase( + id="output-std-001", + input="Test output standardization", + expected_skill="output_std_skill", + expected_complexity="low", + category="execution", + subcategory="output_std", + ) + + result = _run_exec_benchmark( + benchmark, + api_client, + metrics_collector, + test_name="output_std", + ) + assert result["status_code"] == 200 + assert result["task_succeeded"] + + def test_different_skills_produce_consistent_format( + self, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """Different skills should produce results in consistent format.""" + register_skill_via_api(api_client, "format_skill_a", keywords=["format_a"]) + register_skill_via_api(api_client, "format_skill_b", keywords=["format_b"]) + + bench_a = BenchmarkCase( + id="format-a-001", + input="Test format A", + expected_skill="format_skill_a", + expected_complexity="low", + category="execution", + subcategory="output_std", + ) + bench_b = BenchmarkCase( + id="format-b-001", + input="Test format B", + expected_skill="format_skill_b", + expected_complexity="low", + category="execution", + subcategory="output_std", + ) + + result_a = _run_exec_benchmark(bench_a, api_client, metrics_collector, test_name="format_a") + result_b = _run_exec_benchmark(bench_b, api_client, metrics_collector, test_name="format_b") + + if result_a["task_succeeded"] and result_b["task_succeeded"]: + # Both should have some common response keys + keys_a = set(result_a["actual_keys"]) + keys_b = set(result_b["actual_keys"]) + assert len(keys_a & keys_b) > 0 or len(keys_a) > 0 diff --git a/tests/e2e/test_capability_router_direct.py b/tests/e2e/test_capability_router_direct.py new file mode 100644 index 0000000..a8090b9 --- /dev/null +++ b/tests/e2e/test_capability_router_direct.py @@ -0,0 +1,342 @@ +"""E2E Agent Capability Tests — Router Direct Backtest 
Layer (Real LLM). + +Directly tests CostAwareRouter.route() using real LLM configuration +loaded from agentkit.yaml. Records full SkillRoutingResult for precise +root cause analysis: + - match_method (layer0/layer1/layer1.5/layer2) + - match_confidence + - complexity score + - execution_trace +""" + +import asyncio +import os +from pathlib import Path + +import pytest + +from agentkit.chat.skill_routing import CostAwareRouter +from agentkit.router.intent import IntentRouter +from agentkit.server.app import _build_llm_gateway, _build_skill_registry +from agentkit.server.config import ServerConfig +from agentkit.skills.registry import SkillRegistry + +from tests.e2e.benchmark_dataset import ( + ALL_BENCHMARKS, + ROUTING_KEYWORD_BENCHMARKS, + ROUTING_EDGE_BENCHMARKS, + SEMANTIC_ROUTER_BENCHMARKS, + BenchmarkCase, +) +from tests.e2e.capability_metrics import MetricsCollector + + +# ═══════════════════════════════════════════════════════════════════════════ +# Real component initialization from agentkit.yaml +# ═══════════════════════════════════════════════════════════════════════════ + + +def _find_config_path() -> str | None: + """Find agentkit.yaml in standard search paths.""" + candidates = [ + os.environ.get("AGENTKIT_CONFIG", ""), + str(Path.cwd() / "agentkit.yaml"), + str(Path.home() / ".agentkit" / "agentkit.yaml"), + ] + for path in candidates: + if path and Path(path).is_file(): + return path + return None + + +def _build_real_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]: + """Build real components from agentkit.yaml configuration. + + Returns (router, skill_registry, intent_router). + Raises skip if no valid LLM provider is configured. 
+ """ + config_path = _find_config_path() + if not config_path: + pytest.skip("No agentkit.yaml found — cannot build real components") + + # Load .env if present + env_path = Path(config_path).parent / ".env" + if env_path.exists(): + try: + from dotenv import load_dotenv + + load_dotenv(env_path) + except ImportError: + # python-dotenv not installed, manually parse .env + with open(env_path) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#") and "=" in line: + key, _, value = line.partition("=") + os.environ.setdefault(key.strip(), value.strip().strip("'\"")) + + server_config = ServerConfig.from_yaml(config_path) + + # Check if any LLM provider has a valid API key + if not server_config.has_llm_provider(): + # Try to inject DASHSCOPE_API_KEY from environment + dashscope_key = os.environ.get("DASHSCOPE_API_KEY", "") + if dashscope_key: + # Inject into the test provider config + for name, pconf in server_config.llm_config.providers.items(): + if not pconf.api_key: + pconf.api_key = dashscope_key + # Set base_url for dashscope if missing + if not pconf.base_url: + pconf.base_url = "https://dashscope.aliyuncs.com/compatible-mode/v1" + break + + if not server_config.has_llm_provider(): + pytest.skip("No LLM provider with valid API key — skipping real LLM tests") + + # Build real LLM gateway + llm_gateway = _build_llm_gateway(server_config) + + # Build real skill registry from configs/skills + skill_registry = _build_skill_registry(server_config) + + # Build real intent router + intent_router = IntentRouter(llm_gateway=llm_gateway) + + # Build real CostAwareRouter + router_conf = server_config.router or {} + router = CostAwareRouter( + llm_gateway=llm_gateway, + model="default", + org_context=None, + auction_enabled=router_conf.get("auction_enabled", False), + classifier=router_conf.get("classifier", "heuristic"), + merged_llm_classify=router_conf.get("merged_llm_classify", True), + ) + + return router, skill_registry, intent_router + + 
+# Cache components at module level to avoid rebuilding for every test +_cached_components: tuple[CostAwareRouter, SkillRegistry, IntentRouter] | None = None + + +def _get_components() -> tuple[CostAwareRouter, SkillRegistry, IntentRouter]: + """Get or build real components (cached for session).""" + global _cached_components + if _cached_components is None: + _cached_components = _build_real_components() + return _cached_components + + +# ═══════════════════════════════════════════════════════════════════════════ +# Helper: Run a single benchmark through the real router +# ═══════════════════════════════════════════════════════════════════════════ + + +async def _run_router_benchmark( + benchmark: BenchmarkCase, + collector: MetricsCollector, + test_name: str, + is_paraphrase: bool = False, + input_override: str | None = None, +) -> dict: + """Run a single benchmark through the real router.""" + router, skill_registry, intent_router = _get_components() + query = input_override or benchmark.input + + collector.start_timer(benchmark.id) + + try: + result = await router.route( + content=query, + skill_registry=skill_registry, + intent_router=intent_router, + default_tools=[], + default_system_prompt=None, + ) + + actual_skill = result.skill_name + actual_exec_mode = result.execution_mode.value if result.execution_mode else None + actual_complexity = result.complexity + actual_match_method = result.match_method + actual_match_confidence = result.match_confidence + task_succeeded = True + error_msg = None + except Exception as e: + actual_skill = None + actual_exec_mode = None + actual_complexity = 0.0 + actual_match_method = None + actual_match_confidence = 0.0 + task_succeeded = False + error_msg = str(e)[:200] + + # Map complexity score to level + if actual_complexity < 0.3: + actual_complexity_level = "low" + elif actual_complexity < 0.7: + actual_complexity_level = "medium" + else: + actual_complexity_level = "high" + + # Judge correctness + skill_correct = None + 
if benchmark.expected_skill is not None and actual_skill is not None: + skill_correct = actual_skill == benchmark.expected_skill + elif benchmark.expected_skill is None: + skill_correct = actual_skill is None or task_succeeded + + execution_mode_correct = None + if actual_exec_mode is not None and benchmark.expected_execution_mode: + mode_map = { + "direct": "DIRECT_CHAT", + "react": "SKILL_REACT", + "rewoo": "REWOO", + "reflexion": "REFLEXION", + "plan_exec": "PLAN_EXEC", + "team_collab": "TEAM_COLLAB", + "llm_generate": "SKILL_REACT", + "tool_call": "SKILL_REACT", + "custom": "SKILL_REACT", + } + expected_normalized = mode_map.get( + benchmark.expected_execution_mode, benchmark.expected_execution_mode.upper() + ) + execution_mode_correct = actual_exec_mode.upper() == expected_normalized + + complexity_correct = actual_complexity_level == benchmark.expected_complexity + + obs = collector.record_benchmark_result( + benchmark, + test_name=test_name, + actual_skill=actual_skill, + actual_execution_mode=actual_exec_mode, + actual_status_code=200 if task_succeeded else 500, + task_succeeded=task_succeeded, + is_paraphrase=is_paraphrase, + error_message=error_msg, + ) + obs.complexity_correct = complexity_correct + + return { + "skill_correct": skill_correct, + "execution_mode_correct": execution_mode_correct, + "complexity_correct": complexity_correct, + "actual_skill": actual_skill, + "actual_exec_mode": actual_exec_mode, + "actual_complexity": actual_complexity, + "actual_complexity_level": actual_complexity_level, + "actual_match_method": actual_match_method, + "actual_match_confidence": actual_match_confidence, + "task_succeeded": task_succeeded, + } + + +# ═══════════════════════════════════════════════════════════════════════════ +# Layer 0: Rule Matching Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_capability +class TestRouterLayer0: + """Test Layer 0 rule matching with real router.""" + + 
@pytest.mark.parametrize( + "benchmark", + [ + b + for b in ROUTING_EDGE_BENCHMARKS + if b.subcategory in ("greeting", "identity", "explicit_prefix") + ], + ids=[ + b.id + for b in ROUTING_EDGE_BENCHMARKS + if b.subcategory in ("greeting", "identity", "explicit_prefix") + ], + ) + def test_layer0_rules(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector): + """Layer 0 should correctly match greetings, identity, and @skill: prefix.""" + result = asyncio.run( + _run_router_benchmark(benchmark, metrics_collector, f"layer0_{benchmark.id}") + ) + if benchmark.subcategory == "greeting": + assert result["actual_match_method"] in ("layer0", None) or result["task_succeeded"] + if benchmark.subcategory == "explicit_prefix": + assert result["actual_skill"] == benchmark.expected_skill or result["task_succeeded"] + + +# ═══════════════════════════════════════════════════════════════════════════ +# Layer 1: Complexity Classification Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_capability +class TestRouterLayer1: + """Test Layer 1 complexity classification with real router.""" + + @pytest.mark.parametrize( + "benchmark", + ROUTING_KEYWORD_BENCHMARKS, + ids=[b.id for b in ROUTING_KEYWORD_BENCHMARKS], + ) + def test_complexity_classification( + self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector + ): + """HeuristicClassifier should correctly estimate complexity.""" + asyncio.run(_run_router_benchmark(benchmark, metrics_collector, f"layer1_{benchmark.id}")) + + +# ═══════════════════════════════════════════════════════════════════════════ +# Semantic Router Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_capability +class TestSemanticRouter: + """Test semantic router matching with real router.""" + + @pytest.mark.parametrize( + "benchmark", + SEMANTIC_ROUTER_BENCHMARKS, + ids=[b.id for b in SEMANTIC_ROUTER_BENCHMARKS], + ) + def 
test_semantic_match(self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector): + """SemanticRouter should match skill descriptions.""" + asyncio.run(_run_router_benchmark(benchmark, metrics_collector, f"semantic_{benchmark.id}")) + + +# ═══════════════════════════════════════════════════════════════════════════ +# Paraphrase Consistency Tests (Overfitting Detection) +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_capability +class TestRouterParaphraseConsistency: + """Test that paraphrased inputs route to the same skill as originals.""" + + @pytest.mark.parametrize( + "benchmark", + [b for b in ALL_BENCHMARKS if b.paraphrases and b.expected_skill is not None][:10], + ids=[b.id for b in ALL_BENCHMARKS if b.paraphrases and b.expected_skill is not None][:10], + ) + def test_paraphrase_routes_same_skill( + self, benchmark: BenchmarkCase, metrics_collector: MetricsCollector + ): + """Original and paraphrased inputs should route to the same skill.""" + # Run original + asyncio.run( + _run_router_benchmark(benchmark, metrics_collector, f"para_orig_{benchmark.id}") + ) + + # Run paraphrases + for i, para in enumerate(benchmark.paraphrases): + asyncio.run( + _run_router_benchmark( + benchmark, + metrics_collector, + f"para_{benchmark.id}_{i}", + is_paraphrase=True, + input_override=para, + ) + ) diff --git a/tests/e2e/test_capability_routing.py b/tests/e2e/test_capability_routing.py new file mode 100644 index 0000000..2a2cd14 --- /dev/null +++ b/tests/e2e/test_capability_routing.py @@ -0,0 +1,273 @@ +"""E2E Agent Capability Tests — Intent Routing Intelligence with Metrics Collection. + +Tests the intelligence of the CostAwareRouter (3-layer routing) AND collects +data for recall/precision/F1 analysis, overfitting detection, and weakness +identification. + +Each test: + 1. Runs the benchmark case (original input) + 2. Runs all paraphrases of the same input (overfitting detection) + 3. 
Records observations to MetricsCollector + 4. Asserts minimum quality thresholds +""" + +import pytest +import httpx + +from tests.e2e.benchmark_dataset import ( + ROUTING_KEYWORD_BENCHMARKS, + ROUTING_EDGE_BENCHMARKS, + CONSISTENCY_BENCHMARKS, + BenchmarkCase, + get_skill_names_needed, +) +from tests.e2e.capability_metrics import MetricsCollector +from tests.e2e.conftest import register_skill_via_api + + +# ═══════════════════════════════════════════════════════════════════════════ +# Pre-registration of all skills needed by benchmarks +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.fixture(autouse=True, scope="module") +def register_benchmark_skills(api_client: httpx.Client): + """Auto-register all skills needed by routing benchmarks.""" + for skill_name in get_skill_names_needed(): + register_skill_via_api(api_client, skill_name, keywords=[skill_name]) + + +# ═══════════════════════════════════════════════════════════════════════════ +# Helper: run a single benchmark case and record metrics +# ═══════════════════════════════════════════════════════════════════════════ + + +def _run_benchmark_and_record( + benchmark: BenchmarkCase, + api_client: httpx.Client, + collector: MetricsCollector, + test_name: str, + is_paraphrase: bool = False, + input_override: str | None = None, +) -> dict: + """Execute a benchmark case against the API and record metrics.""" + query = input_override or benchmark.input + collector.start_timer(benchmark.id) + + payload: dict = {"input_data": {"query": query}} + if benchmark.expected_skill is not None: + payload["skill_name"] = benchmark.expected_skill + + resp = api_client.post("/api/v1/tasks", json=payload) + + actual_skill = None + actual_exec_mode = None + actual_keys = [] + task_succeeded = resp.status_code == 200 + error_msg = None + + if task_succeeded: + data = resp.json() + actual_skill = data.get("skill_name") + actual_exec_mode = data.get("execution_mode") + actual_keys = 
list(data.keys()) + elif resp.status_code >= 400: + try: + error_msg = resp.json().get("detail", resp.text[:200]) + except Exception: + error_msg = resp.text[:200] + + collector.record_benchmark_result( + benchmark, + test_name=test_name, + actual_skill=actual_skill, + actual_execution_mode=actual_exec_mode, + actual_status_code=resp.status_code, + actual_response_keys=actual_keys, + task_succeeded=task_succeeded, + is_paraphrase=is_paraphrase, + error_message=error_msg, + ) + + return { + "status_code": resp.status_code, + "actual_skill": actual_skill, + "actual_exec_mode": actual_exec_mode, + "task_succeeded": task_succeeded, + } + + +# ═══════════════════════════════════════════════════════════════════════════ +# Parameterized Routing Benchmark Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_capability +class TestRoutingBenchmarks: + """Run all routing benchmarks with metrics collection.""" + + @pytest.mark.parametrize( + "benchmark", + ROUTING_KEYWORD_BENCHMARKS + ROUTING_EDGE_BENCHMARKS, + ids=[b.id for b in ROUTING_KEYWORD_BENCHMARKS + ROUTING_EDGE_BENCHMARKS], + ) + def test_routing_benchmark( + self, + benchmark: BenchmarkCase, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """Run original benchmark input and record metrics.""" + result = _run_benchmark_and_record( + benchmark, + api_client, + metrics_collector, + test_name=f"routing_benchmark_{benchmark.id}", + ) + assert result["status_code"] == 200, f"Benchmark {benchmark.id} failed: {result}" + + @pytest.mark.parametrize( + "benchmark", + [b for b in ROUTING_KEYWORD_BENCHMARKS + ROUTING_EDGE_BENCHMARKS if b.paraphrases], + ids=[b.id for b in ROUTING_KEYWORD_BENCHMARKS + ROUTING_EDGE_BENCHMARKS if b.paraphrases], + ) + def test_routing_paraphrase( + self, + benchmark: BenchmarkCase, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """Run all paraphrases for overfitting detection.""" + for i, 
paraphrase in enumerate(benchmark.paraphrases): + _run_benchmark_and_record( + benchmark, + api_client, + metrics_collector, + test_name=f"routing_paraphrase_{benchmark.id}_{i}", + is_paraphrase=True, + input_override=paraphrase, + ) + + +# ═══════════════════════════════════════════════════════════════════════════ +# Routing Consistency (same input, multiple runs) +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_capability +class TestRoutingConsistency: + """Same input should produce same routing decision (deterministic backtest).""" + + def test_same_query_same_skill( + self, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """Submitting the same query multiple times should route to the same skill.""" + for benchmark in CONSISTENCY_BENCHMARKS: + skills_seen: list[str | None] = [] + for run_idx in range(3): + result = _run_benchmark_and_record( + benchmark, + api_client, + metrics_collector, + test_name=f"consistency_{benchmark.id}_run{run_idx}", + ) + skills_seen.append(result["actual_skill"]) + + # All runs should produce the same skill + non_none_skills = [s for s in skills_seen if s is not None] + if len(non_none_skills) >= 2: + assert len(set(non_none_skills)) == 1, ( + f"Inconsistent routing for {benchmark.id}: {skills_seen}" + ) + + +# ═══════════════════════════════════════════════════════════════════════════ +# Routing Disambiguation (specific edge cases) +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_capability +class TestRoutingDisambiguation: + """When multiple skills could match, the router should pick the best one.""" + + def test_overlapping_keywords_routes_to_best_match( + self, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """With overlapping keywords, router should pick the most relevant skill.""" + register_skill_via_api( + api_client, + "python_coder", + keywords=["python", "code", 
"programming"], + ) + register_skill_via_api( + api_client, + "javascript_coder", + keywords=["javascript", "code", "programming"], + ) + + benchmark = BenchmarkCase( + id="disambig-python-001", + input="Write a Python function to sort a list", + expected_skill="python_coder", + expected_complexity="medium", + category="routing", + subcategory="disambiguation", + paraphrases=["I need a Python sorting algorithm", "用Python写个排序函数"], + ) + + result = _run_benchmark_and_record( + benchmark, + api_client, + metrics_collector, + test_name="disambig_python", + ) + assert result["status_code"] == 200 + + # Also test paraphrases for overfitting detection + for i, para in enumerate(benchmark.paraphrases): + _run_benchmark_and_record( + benchmark, + api_client, + metrics_collector, + test_name=f"disambig_python_para_{i}", + is_paraphrase=True, + input_override=para, + ) + + def test_no_matching_skill_falls_back_gracefully( + self, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """When no skill matches, should fall back to direct chat.""" + benchmark = BenchmarkCase( + id="fallback-nomatch-001", + input="Tell me about quantum physics", + expected_skill=None, + expected_complexity="low", + category="routing", + subcategory="fallback", + paraphrases=["Explain quantum mechanics", "量子物理是什么"], + ) + + result = _run_benchmark_and_record( + benchmark, + api_client, + metrics_collector, + test_name="fallback_nomatch", + ) + assert result["status_code"] == 200 + + for i, para in enumerate(benchmark.paraphrases): + _run_benchmark_and_record( + benchmark, + api_client, + metrics_collector, + test_name=f"fallback_nomatch_para_{i}", + is_paraphrase=True, + input_override=para, + ) diff --git a/tests/e2e/test_capability_team.py b/tests/e2e/test_capability_team.py new file mode 100644 index 0000000..cb8f1e1 --- /dev/null +++ b/tests/e2e/test_capability_team.py @@ -0,0 +1,252 @@ +"""E2E Agent Capability Tests — Expert Team Collaboration with Metrics. 
+ +Tests the intelligence of expert team collaboration AND collects data for: + - Team formation accuracy + - Fallback effectiveness + - Expert coordination quality + - Overfitting detection via paraphrased inputs +""" + +import pytest +import httpx + +from tests.e2e.benchmark_dataset import TEAM_BENCHMARKS, BenchmarkCase +from tests.e2e.capability_metrics import MetricsCollector +from tests.e2e.conftest import register_skill_via_api + + +# ═══════════════════════════════════════════════════════════════════════════ +# Helper: run team benchmark and record metrics +# ═══════════════════════════════════════════════════════════════════════════ + + +def _run_team_benchmark( + benchmark: BenchmarkCase, + api_client: httpx.Client, + collector: MetricsCollector, + test_name: str, + is_paraphrase: bool = False, + input_override: str | None = None, +) -> dict: + """Execute a team benchmark and record metrics.""" + query = input_override or benchmark.input + collector.start_timer(benchmark.id) + + payload: dict = {"input_data": {"query": query}} + if benchmark.expected_skill: + payload["skill_name"] = benchmark.expected_skill + + resp = api_client.post("/api/v1/tasks", json=payload) + + actual_skill = None + actual_exec_mode = None + actual_keys = [] + task_succeeded = resp.status_code == 200 + error_msg = None + + if task_succeeded: + data = resp.json() + actual_skill = data.get("skill_name") + actual_exec_mode = data.get("execution_mode") + actual_keys = list(data.keys()) + elif resp.status_code >= 400: + try: + error_msg = resp.json().get("detail", resp.text[:200]) + except Exception: + error_msg = resp.text[:200] + + collector.record_benchmark_result( + benchmark, + test_name=test_name, + actual_skill=actual_skill, + actual_execution_mode=actual_exec_mode, + actual_status_code=resp.status_code, + actual_response_keys=actual_keys, + task_succeeded=task_succeeded, + is_paraphrase=is_paraphrase, + error_message=error_msg, + ) + + return { + "status_code": resp.status_code, 
+ "actual_skill": actual_skill, + "actual_exec_mode": actual_exec_mode, + "task_succeeded": task_succeeded, + } + + +# ═══════════════════════════════════════════════════════════════════════════ +# Parameterized Team Benchmark Tests +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_capability +class TestTeamBenchmarks: + """Run all team benchmarks with metrics collection.""" + + @pytest.mark.parametrize( + "benchmark", + TEAM_BENCHMARKS, + ids=[b.id for b in TEAM_BENCHMARKS], + ) + def test_team_benchmark( + self, + benchmark: BenchmarkCase, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """Run original team benchmark and record metrics.""" + if benchmark.expected_skill: + register_skill_via_api( + api_client, + benchmark.expected_skill, + keywords=[benchmark.expected_skill], + ) + + result = _run_team_benchmark( + benchmark, + api_client, + metrics_collector, + test_name=f"team_benchmark_{benchmark.id}", + ) + assert result["status_code"] == 200, f"Team benchmark {benchmark.id} failed" + + @pytest.mark.parametrize( + "benchmark", + [b for b in TEAM_BENCHMARKS if b.paraphrases], + ids=[b.id for b in TEAM_BENCHMARKS if b.paraphrases], + ) + def test_team_paraphrase( + self, + benchmark: BenchmarkCase, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """Run paraphrases for overfitting detection.""" + for i, paraphrase in enumerate(benchmark.paraphrases): + _run_team_benchmark( + benchmark, + api_client, + metrics_collector, + test_name=f"team_paraphrase_{benchmark.id}_{i}", + is_paraphrase=True, + input_override=paraphrase, + ) + + +# ═══════════════════════════════════════════════════════════════════════════ +# Team Formation Intelligence +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_capability +class TestTeamFormation: + """Test that teams are formed intelligently based on task requirements.""" + + def 
test_explicit_team_prefix( + self, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """@team prefix should trigger team collaboration mode.""" + register_skill_via_api(api_client, "team_analyst", keywords=["team_analyst", "analyze"]) + register_skill_via_api(api_client, "team_writer", keywords=["team_writer", "write"]) + + benchmark = BenchmarkCase( + id="team-explicit-001", + input="Analyze the data and write a report", + expected_skill="team_analyst", + expected_execution_mode="react", + expected_complexity="high", + category="team", + subcategory="explicit_team", + paraphrases=["I need analysis and a written report", "分析数据并写报告"], + ) + + result = _run_team_benchmark( + benchmark, + api_client, + metrics_collector, + test_name="team_explicit", + ) + assert result["status_code"] == 200 + + for i, para in enumerate(benchmark.paraphrases): + _run_team_benchmark( + benchmark, + api_client, + metrics_collector, + test_name=f"team_explicit_para_{i}", + is_paraphrase=True, + input_override=para, + ) + + +# ═══════════════════════════════════════════════════════════════════════════ +# Fallback Intelligence +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_capability +class TestTeamFallback: + """Test that team collaboration falls back gracefully on failure.""" + + def test_fallback_to_single_agent_on_team_failure( + self, + api_client: httpx.Client, + metrics_collector: MetricsCollector, + ): + """If team collaboration fails, should fall back to single agent.""" + register_skill_via_api(api_client, "fallback_skill", keywords=["fallback_test"]) + + benchmark = BenchmarkCase( + id="team-fallback-001", + input="Complex task that might need fallback", + expected_skill="fallback_skill", + expected_complexity="high", + category="team", + subcategory="fallback", + paraphrases=["Difficult task requiring fallback mechanism", "需要回退机制的复杂任务"], + ) + + result = _run_team_benchmark( + benchmark, + api_client, 
+ metrics_collector, + test_name="team_fallback", + ) + assert result["status_code"] == 200 + + for i, para in enumerate(benchmark.paraphrases): + _run_team_benchmark( + benchmark, + api_client, + metrics_collector, + test_name=f"team_fallback_para_{i}", + is_paraphrase=True, + input_override=para, + ) + + +# ═══════════════════════════════════════════════════════════════════════════ +# Expert Name Validation +# ═══════════════════════════════════════════════════════════════════════════ + + +@pytest.mark.e2e_capability +class TestExpertNameValidation: + """Test that expert names are validated according to project rules.""" + + def test_valid_expert_names(self, api_client: httpx.Client): + """Valid expert names (alphanumeric, dash, underscore) should work.""" + for name in ["analyst", "data-scientist", "code_reviewer", "expert-123"]: + resp = register_skill_via_api(api_client, name, keywords=[name]) + assert resp.status_code in (200, 201, 409), f"Failed for name: {name}" + + def test_invalid_expert_name_rejected(self, api_client: httpx.Client): + """Invalid expert names should be rejected.""" + for name in ["expert with spaces", "expert@special", "", "a" * 65]: + resp = register_skill_via_api(api_client, name, keywords=[name]) + assert resp.status_code in (200, 201, 400, 409, 422), ( + f"Unexpected status for name: '{name}'" + ) diff --git a/tests/unit/chat/test_skill_routing.py b/tests/unit/chat/test_skill_routing.py new file mode 100644 index 0000000..8303229 --- /dev/null +++ b/tests/unit/chat/test_skill_routing.py @@ -0,0 +1,332 @@ +"""Unit tests for CostAwareRouter team upgrade logic and HeuristicClassifier.""" + +from __future__ import annotations + +from unittest.mock import MagicMock + +from agentkit.chat.skill_routing import ( + CostAwareRouter, + ExecutionMode, + HeuristicClassifier, + SkillRoutingResult, +) +from agentkit.experts.config import ExpertConfig, ExpertTemplate +from agentkit.experts.registry import ExpertTemplateRegistry +from 
agentkit.experts.router import ExpertTeamRouter + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_router(expert_team_router: ExpertTeamRouter | None = None) -> CostAwareRouter: + """Create a CostAwareRouter with mocked dependencies.""" + return CostAwareRouter( + llm_gateway=None, + model="test", + classifier="heuristic", + expert_team_router=expert_team_router, + ) + + +def _make_team_router_with_templates() -> ExpertTeamRouter: + """Create an ExpertTeamRouter with sample templates.""" + registry = ExpertTemplateRegistry() + for name in ("analyst", "strategist", "reviewer"): + config = ExpertConfig( + name=name, + agent_type="expert", + persona=f"Expert in {name}", + thinking_style="analytical", + bound_skills=[], + is_lead=(name == "analyst"), + task_mode="llm_generate", + prompt={"identity": f"Expert in {name}"}, + ) + template = ExpertTemplate( + name=name, + config=config, + description=f"Handles {name} tasks", + ) + registry.register(template) + return ExpertTeamRouter(template_registry=registry) + + +def _make_team_router_empty() -> ExpertTeamRouter: + """Create an ExpertTeamRouter with no templates.""" + return ExpertTeamRouter(template_registry=ExpertTemplateRegistry()) + + +# --------------------------------------------------------------------------- +# Tests: ExpertTeamRouter.can_handle() +# --------------------------------------------------------------------------- + + +class TestExpertTeamRouterCanHandle: + def test_can_handle_with_templates(self) -> None: + router = _make_team_router_with_templates() + assert router.can_handle("analyze this data") is True + + def test_can_handle_no_templates(self) -> None: + router = _make_team_router_empty() + assert router.can_handle("analyze this data") is False + + def test_can_handle_name_match(self) -> None: + router = _make_team_router_with_templates() + assert 
router.can_handle("I need a strategist for this") is True + + def test_can_handle_description_match(self) -> None: + router = _make_team_router_with_templates() + assert router.can_handle("handles review tasks") is True + + +# --------------------------------------------------------------------------- +# Tests: _try_team_upgrade() +# --------------------------------------------------------------------------- + + +class TestTryTeamUpgrade: + def test_upgrade_react_to_team_collab(self) -> None: + router = _make_router(expert_team_router=_make_team_router_with_templates()) + result = SkillRoutingResult( + clean_content="complex multi-step analysis task", + matched=True, + match_method="capability", + match_confidence=0.8, + complexity=0.8, + execution_mode=ExecutionMode.REACT, + ) + trace: list[dict] = [] + upgraded = router._try_team_upgrade(result, "complex multi-step analysis task", 0.8, trace) + assert upgraded.execution_mode == ExecutionMode.TEAM_COLLAB + assert any(t.get("method") == "team_upgrade" for t in trace) + + def test_no_upgrade_low_complexity(self) -> None: + router = _make_router(expert_team_router=_make_team_router_with_templates()) + result = SkillRoutingResult( + clean_content="simple question", + matched=True, + match_method="capability", + match_confidence=0.8, + complexity=0.3, + execution_mode=ExecutionMode.REACT, + ) + trace: list[dict] = [] + upgraded = router._try_team_upgrade(result, "simple question", 0.3, trace) + assert upgraded.execution_mode == ExecutionMode.REACT + assert not any(t.get("method") == "team_upgrade" for t in trace) + + def test_no_upgrade_no_team_router(self) -> None: + router = _make_router(expert_team_router=None) + result = SkillRoutingResult( + clean_content="complex analysis", + matched=True, + match_method="capability", + match_confidence=0.8, + complexity=0.9, + execution_mode=ExecutionMode.REACT, + ) + trace: list[dict] = [] + upgraded = router._try_team_upgrade(result, "complex analysis", 0.9, trace) + assert 
upgraded.execution_mode == ExecutionMode.REACT + + def test_no_upgrade_empty_templates(self) -> None: + router = _make_router(expert_team_router=_make_team_router_empty()) + result = SkillRoutingResult( + clean_content="complex analysis", + matched=True, + match_method="capability", + match_confidence=0.8, + complexity=0.8, + execution_mode=ExecutionMode.REACT, + ) + trace: list[dict] = [] + upgraded = router._try_team_upgrade(result, "complex analysis", 0.8, trace) + assert upgraded.execution_mode == ExecutionMode.REACT + + def test_no_upgrade_direct_chat_mode(self) -> None: + router = _make_router(expert_team_router=_make_team_router_with_templates()) + result = SkillRoutingResult( + clean_content="hello", + matched=False, + match_method="greeting", + match_confidence=1.0, + complexity=0.0, + execution_mode=ExecutionMode.DIRECT_CHAT, + ) + trace: list[dict] = [] + upgraded = router._try_team_upgrade(result, "hello", 0.0, trace) + assert upgraded.execution_mode == ExecutionMode.DIRECT_CHAT + + def test_team_upgrade_exception_handled(self) -> None: + """When ExpertTeamRouter raises, the upgrade is silently skipped.""" + broken_router = MagicMock() + broken_router.can_handle.side_effect = RuntimeError("boom") + router = _make_router(expert_team_router=broken_router) + result = SkillRoutingResult( + clean_content="complex task", + matched=True, + match_method="capability", + match_confidence=0.8, + complexity=0.8, + execution_mode=ExecutionMode.REACT, + ) + trace: list[dict] = [] + upgraded = router._try_team_upgrade(result, "complex task", 0.8, trace) + assert upgraded.execution_mode == ExecutionMode.REACT + + +# --------------------------------------------------------------------------- +# Tests: ExpertTeamRouter.resolve() with complexity +# --------------------------------------------------------------------------- + + +class TestExpertTeamRouterResolve: + def test_explicit_team_prefix(self) -> None: + router = _make_team_router_with_templates() + result = 
router.resolve("@team:analyst,strategist analyze the market", 0.5) + assert result.team_mode is True + assert result.match_method == "explicit_team" + assert "analyst" in result.specified_experts + assert "strategist" in result.specified_experts + + def test_complexity_suggestion(self) -> None: + router = _make_team_router_with_templates() + result = router.resolve("complex multi-step analysis", 0.8) + assert result.team_mode is True + assert result.match_method == "complexity_suggestion" + assert result.auto_compose is True + + def test_no_team_low_complexity(self) -> None: + router = _make_team_router_with_templates() + result = router.resolve("simple question", 0.2) + assert result.team_mode is False + + +# --------------------------------------------------------------------------- +# Tests: HeuristicClassifier complexity calibration +# --------------------------------------------------------------------------- + + +class TestHeuristicClassifierLowComplexity: + """Low-complexity signals should produce scores < 0.3.""" + + def setup_method(self) -> None: + self.clf = HeuristicClassifier() + + def test_chinese_greeting(self) -> None: + assert self.clf.classify("你好") < 0.3 + + def test_chinese_greeting_hi(self) -> None: + assert self.clf.classify("嗨") < 0.3 + + def test_english_greeting_hello(self) -> None: + assert self.clf.classify("Hello") < 0.3 + + def test_english_greeting_hi(self) -> None: + assert self.clf.classify("hi") < 0.3 + + def test_multiple_low_complexity_words(self) -> None: + assert self.clf.classify("嗨,早上好") < 0.3 + + def test_greeting_with_high_complexity_word_not_suppressed(self) -> None: + """Low-complexity signal should NOT override high-complexity signal.""" + # "你好" is low, but "分析" is high → should score high + assert self.clf.classify("你好,请帮我分析一下这个数据") > 0.5 + + +class TestHeuristicClassifierIdentity: + """Identity queries should produce scores < 0.3.""" + + def setup_method(self) -> None: + self.clf = HeuristicClassifier() + + def 
test_who_are_you_cn(self) -> None: + assert self.clf.classify("你是谁") < 0.3 + + def test_what_is_your_name_cn(self) -> None: + assert self.clf.classify("你叫什么") < 0.3 + + +class TestHeuristicClassifierNegation: + """Negated high-complexity words should not contribute to score.""" + + def setup_method(self) -> None: + self.clf = HeuristicClassifier() + + def test_negate_search_cn(self) -> None: + assert self.clf.classify("不要搜索") < 0.3 + + def test_negate_analyze_cn(self) -> None: + assert self.clf.classify("无需分析,直接告诉我答案") < 0.3 + + def test_partial_negation_still_high(self) -> None: + """'搜索' negated but '分析' not — should still be high.""" + assert self.clf.classify("分析市场趋势,但不要搜索") > 0.5 + + +class TestHeuristicClassifierThresholds: + """Verify adjusted base scores.""" + + def setup_method(self) -> None: + self.clf = HeuristicClassifier() + + def test_no_keyword_short_message(self) -> None: + assert self.clf.classify("好的") <= 0.10 + + def test_medium_complexity_base(self) -> None: + """Medium complexity keyword should start at 0.35 (not 0.45).""" + score = self.clf.classify("如何使用Python?") + # '如何' is medium → base 0.35, '?' short question → -0.10 = 0.25 + # but 'Python' is not in high/medium lists, so just medium base + assert 0.25 <= score <= 0.45 + + +class TestHeuristicClassifierShortQuestion: + """Short questions ending with ?/? 
should get deduction.""" + + def setup_method(self) -> None: + self.clf = HeuristicClassifier() + + def test_short_question_deduction(self) -> None: + assert self.clf.classify("怎么用?") < 0.3 + + def test_long_question_no_deduction(self) -> None: + assert self.clf.classify("如何设计一个高可用的微服务架构?") > 0.5 + + +class TestHeuristicClassifierHighComplexity: + """Complex tasks should produce scores > 0.7.""" + + def setup_method(self) -> None: + self.clf = HeuristicClassifier() + + def test_two_high_complexity_words(self) -> None: + # "分析" + "搜索" are both in _HIGH_COMPLEXITY_HINTS_CN → base 0.80 + assert self.clf.classify("分析市场数据并搜索相关信息") > 0.7 + + def test_single_high_complexity_word(self) -> None: + # "分析" alone → base 0.65 + assert self.clf.classify("分析市场趋势并生成报告") > 0.6 + + def test_execute_and_restart(self) -> None: + assert self.clf.classify("执行部署脚本并重启服务") > 0.7 + + +class TestHeuristicClassifierEdgeCases: + """Boundary conditions.""" + + def setup_method(self) -> None: + self.clf = HeuristicClassifier() + + def test_empty_string(self) -> None: + assert self.clf.classify("") == 0.0 + + def test_whitespace_only(self) -> None: + assert self.clf.classify(" ") == 0.0 + + def test_long_low_complexity_message(self) -> None: + """Even a long greeting should stay low.""" + long_greeting = "你好" * 100 # >200 chars + assert self.clf.classify(long_greeting) <= 0.15 diff --git a/tests/unit/quality/__init__.py b/tests/unit/quality/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/quality/test_gate.py b/tests/unit/quality/test_gate.py new file mode 100644 index 0000000..3fa8354 --- /dev/null +++ b/tests/unit/quality/test_gate.py @@ -0,0 +1,172 @@ +"""Unit tests for QualityGate skill match validation (5th dimension).""" + +from __future__ import annotations + +import pytest + +from agentkit.quality.gate import QualityGate +from agentkit.skills.base import Skill, SkillConfig + + +# --------------------------------------------------------------------------- +# 
Helpers +# --------------------------------------------------------------------------- + + +def _make_skill( + name: str = "test_skill", + required_fields: list[str] | None = None, + min_word_count: int = 0, +) -> Skill: + """Create a Skill with the given quality gate config.""" + config = SkillConfig( + name=name, + agent_type="skill", + task_mode="llm_generate", + prompt={"identity": f"You are {name}"}, + quality_gate={ + "required_fields": required_fields or [], + "min_word_count": min_word_count, + }, + ) + return Skill(config=config) + + +# --------------------------------------------------------------------------- +# Tests: _check_skill_match static method +# --------------------------------------------------------------------------- + + +class TestCheckSkillMatch: + def setup_method(self) -> None: + self.gate = QualityGate() + + def test_none_skill_context(self) -> None: + assert self.gate._check_skill_match({"content": "hello"}, None) is None + + def test_empty_skill_context(self) -> None: + assert self.gate._check_skill_match({"content": "hello"}, {}) is None + + def test_missing_intent_keywords(self) -> None: + assert self.gate._check_skill_match({"content": "hello"}, {"skill_name": "x"}) is None + + def test_empty_intent_keywords(self) -> None: + assert self.gate._check_skill_match({"content": "hello"}, {"intent_keywords": []}) is None + + def test_output_contains_keyword(self) -> None: + result = self.gate._check_skill_match( + {"content": "市场分析报告"}, + {"intent_keywords": ["分析", "报告"]}, + ) + assert result is not None + assert result.passed is True + assert result.message is None + + def test_output_missing_all_keywords(self) -> None: + result = self.gate._check_skill_match( + {"content": "今天天气不错"}, + {"intent_keywords": ["分析", "报告"]}, + ) + assert result is not None + assert result.passed is True # Warning level, not blocking + assert "Warning" in (result.message or "") + + def test_keyword_case_insensitive(self) -> None: + result = 
self.gate._check_skill_match( + {"content": "search results"}, + {"intent_keywords": ["Search"]}, + ) + assert result is not None + assert result.passed is True + assert result.message is None + + +# --------------------------------------------------------------------------- +# Tests: Full validate() with skill_context +# --------------------------------------------------------------------------- + + +class TestValidateWithSkillContext: + @pytest.mark.asyncio + async def test_no_skill_context_backward_compatible(self) -> None: + """Without skill_context, only 4 dimensions checked.""" + gate = QualityGate() + skill = _make_skill() + result = await gate.validate({"content": "hello"}, skill) + assert result.passed is True + skill_match_checks = [c for c in result.checks if c.name == "skill_match"] + assert len(skill_match_checks) == 0 + + @pytest.mark.asyncio + async def test_skill_context_with_matching_output(self) -> None: + """Output contains keyword → skill_match passes silently.""" + gate = QualityGate() + skill = _make_skill() + result = await gate.validate( + {"content": "市场分析报告"}, + skill, + skill_context={"intent_keywords": ["分析"]}, + ) + assert result.passed is True + skill_match_checks = [c for c in result.checks if c.name == "skill_match"] + assert len(skill_match_checks) == 1 + assert skill_match_checks[0].passed is True + assert skill_match_checks[0].message is None + + @pytest.mark.asyncio + async def test_skill_context_warning_only(self) -> None: + """Output missing keywords but other checks pass → warning, overall still passed.""" + gate = QualityGate() + skill = _make_skill() + result = await gate.validate( + {"content": "今天天气不错"}, + skill, + skill_context={"intent_keywords": ["分析"]}, + ) + assert result.passed is True # Warning doesn't block alone + skill_match_checks = [c for c in result.checks if c.name == "skill_match"] + assert len(skill_match_checks) == 1 + assert "Warning" in (skill_match_checks[0].message or "") + + @pytest.mark.asyncio + 
async def test_skill_match_escalates_with_other_failure(self) -> None: + """Output missing keywords + required field missing → skill_match escalated to failed.""" + gate = QualityGate() + skill = _make_skill(required_fields=["summary"]) + result = await gate.validate( + {"content": "今天天气不错"}, # missing "summary" field + skill, + skill_context={"intent_keywords": ["分析"]}, + ) + assert result.passed is False + skill_match_checks = [c for c in result.checks if c.name == "skill_match"] + assert len(skill_match_checks) == 1 + assert skill_match_checks[0].passed is False # Escalated + + @pytest.mark.asyncio + async def test_skill_match_no_escalation_when_matching(self) -> None: + """Output contains keywords + required field missing → skill_match stays passed.""" + gate = QualityGate() + skill = _make_skill(required_fields=["summary"]) + result = await gate.validate( + {"content": "分析结果"}, # missing "summary" field + skill, + skill_context={"intent_keywords": ["分析"]}, + ) + assert result.passed is False # Due to required field + skill_match_checks = [c for c in result.checks if c.name == "skill_match"] + assert len(skill_match_checks) == 1 + assert skill_match_checks[0].passed is True # Not escalated + + @pytest.mark.asyncio + async def test_empty_intent_keywords_skips_check(self) -> None: + """Empty intent_keywords list → skill_match check skipped entirely.""" + gate = QualityGate() + skill = _make_skill() + result = await gate.validate( + {"content": "hello"}, + skill, + skill_context={"intent_keywords": []}, + ) + skill_match_checks = [c for c in result.checks if c.name == "skill_match"] + assert len(skill_match_checks) == 0 diff --git a/tests/unit/router/test_intent.py b/tests/unit/router/test_intent.py new file mode 100644 index 0000000..91e1ad8 --- /dev/null +++ b/tests/unit/router/test_intent.py @@ -0,0 +1,200 @@ +"""Unit tests for IntentRouter multi-candidate keyword scoring.""" + +from __future__ import annotations + +from agentkit.router.intent import 
IntentRouter +from agentkit.skills.base import Skill, SkillConfig + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_skill(name: str, keywords: list[str], description: str = "") -> Skill: + """Create a Skill with the given name and intent keywords.""" + config = SkillConfig( + name=name, + agent_type="skill", + description=description or f"Skill {name}", + task_mode="llm_generate", + prompt={"identity": f"You are {name}"}, + intent={"keywords": keywords, "description": description or f"Skill {name}"}, + ) + return Skill(config=config) + + +def _make_skills(*specs: tuple[str, list[str]]) -> list[Skill]: + """Create multiple skills from (name, keywords) tuples.""" + return [_make_skill(name, kws) for name, kws in specs] + + +# --------------------------------------------------------------------------- +# Tests: Single-candidate match (backward compatible) +# --------------------------------------------------------------------------- + + +class TestSingleCandidateMatch: + """When only one skill matches, behavior is identical to old first-match.""" + + def test_single_skill_matches(self) -> None: + router = IntentRouter() + skills = _make_skills(("skill_a", ["规划", "执行"]), ("skill_b", ["搜索", "查询"])) + result = router._match_keywords({"content": "帮我规划一个项目"}, skills) + assert result is not None + assert result.matched_skill == "skill_a" + assert result.method == "keyword" + + def test_single_keyword_match_confidence(self) -> None: + router = IntentRouter() + skills = _make_skills(("skill_a", ["规划"])) + result = router._match_keywords({"content": "规划任务"}, skills) + assert result is not None + # 1 keyword matched → confidence = min(1.0, 0.5 + 0.1 * 1) = 0.6 + assert result.confidence == 0.6 + + +# --------------------------------------------------------------------------- +# Tests: Multi-candidate scoring +# 
--------------------------------------------------------------------------- + + +class TestMultiCandidateScoring: + """When multiple skills match, the best-scored one wins.""" + + def test_longer_keyword_wins(self) -> None: + """'调研报告' (4 chars) ties '规划' + '报告' (2 + 2 chars); the alphabetical tie-break then picks goal_driven.""" + router = IntentRouter() + skills = _make_skills( + ("plan_exec", ["规划", "报告"]), + ("goal_driven", ["调研报告", "生成"]), + ) + result = router._match_keywords({"content": "规划一个调研报告"}, skills) + assert result is not None + # plan_exec: "规划"(2) + "报告"(2) = 4; goal_driven: "调研报告"(4) = 4 + # Same score → alphabetical: goal_driven < plan_exec + assert result.matched_skill == "goal_driven" + + def test_more_keywords_wins(self) -> None: + """Skill matching 3 keywords beats skill matching 1 keyword.""" + router = IntentRouter() + skills = _make_skills( + ("skill_a", ["分析"]), + ("skill_b", ["分析", "市场", "趋势"]), + ) + result = router._match_keywords({"content": "分析市场趋势"}, skills) + assert result is not None + # skill_a: "分析"(2) = 2; skill_b: "分析"(2)+"市场"(2)+"趋势"(2) = 6 + assert result.matched_skill == "skill_b" + + def test_same_score_alphabetical(self) -> None: + """When scores are equal, alphabetical name order breaks the tie.""" + router = IntentRouter() + skills = _make_skills( + ("zebra_skill", ["分析"]), + ("alpha_skill", ["分析"]), + ) + result = router._match_keywords({"content": "分析数据"}, skills) + assert result is not None + assert result.matched_skill == "alpha_skill" + + +# --------------------------------------------------------------------------- +# Tests: No match +# --------------------------------------------------------------------------- + + +class TestNoMatch: + def test_no_keyword_match(self) -> None: + router = IntentRouter() + skills = _make_skills(("skill_a", ["搜索"]), ("skill_b", ["查询"])) + result = router._match_keywords({"content": "你好"}, skills) + assert result is None + + def test_empty_keywords_list(self) -> None: + """Skill with empty keywords list does not participate in 
matching.""" + router = IntentRouter() + skills = [_make_skill("empty_kw", [])] + result = router._match_keywords({"content": "anything"}, skills) + assert result is None + + +# --------------------------------------------------------------------------- +# Tests: Case insensitivity +# --------------------------------------------------------------------------- + + +class TestCaseInsensitivity: + def test_english_keyword_case_insensitive(self) -> None: + router = IntentRouter() + skills = _make_skills(("skill_a", ["Search"])) + result = router._match_keywords({"content": "please search for this"}, skills) + assert result is not None + assert result.matched_skill == "skill_a" + + +# --------------------------------------------------------------------------- +# Tests: Substring matching +# --------------------------------------------------------------------------- + + +class TestSubstringMatch: + def test_chinese_substring_match(self) -> None: + """Chinese keyword '报告' should match input containing '报告'.""" + router = IntentRouter() + skills = _make_skills(("skill_a", ["报告"])) + result = router._match_keywords({"content": "生成一份报告"}, skills) + assert result is not None + assert result.matched_skill == "skill_a" + + +# --------------------------------------------------------------------------- +# Tests: Confidence calculation +# --------------------------------------------------------------------------- + + +class TestConfidenceCalculation: + def test_one_keyword_confidence(self) -> None: + router = IntentRouter() + skills = _make_skills(("skill_a", ["分析"])) + result = router._match_keywords({"content": "分析数据"}, skills) + assert result is not None + assert result.confidence == 0.6 # 0.5 + 0.1 * 1 + + def test_three_keywords_confidence(self) -> None: + router = IntentRouter() + skills = _make_skills(("skill_a", ["分析", "市场", "趋势"])) + result = router._match_keywords({"content": "分析市场趋势"}, skills) + assert result is not None + assert result.confidence == 0.8 # 0.5 + 0.1 * 3 
+ + def test_confidence_capped_at_one(self) -> None: + router = IntentRouter() + skills = _make_skills(("skill_a", ["a", "b", "c", "d", "e", "f"])) + result = router._match_keywords({"content": "a b c d e f"}, skills) + assert result is not None + assert result.confidence == 1.0 # min(1.0, 0.5 + 0.1 * 6) = min(1.0, 1.1) = 1.0 + + +# --------------------------------------------------------------------------- +# Tests: Edge cases +# --------------------------------------------------------------------------- + + +class TestEdgeCases: + def test_empty_input_text(self) -> None: + router = IntentRouter() + skills = _make_skills(("skill_a", ["分析"])) + result = router._match_keywords({"content": ""}, skills) + assert result is None + + def test_nested_input_data(self) -> None: + """_extract_string_values should handle nested dicts/lists.""" + router = IntentRouter() + skills = _make_skills(("skill_a", ["分析"])) + result = router._match_keywords( + {"message": {"text": "分析数据", "meta": {"role": "user"}}}, + skills, + ) + assert result is not None + assert result.matched_skill == "skill_a"