fischer-agentkit/tests/e2e/benchmark_dataset.py

1042 lines
39 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Agent Capability Benchmark — Ground Truth Dataset (v2).
Aligned with actual skills in configs/skills/*.yaml.
Contains both manually curated edge cases and auto-generated cases.
Categories:
- routing: intent routing correctness
- execution: execution mode selection accuracy
- team: expert team collaboration
- consistency: deterministic output consistency
- semantic_router: semantic similarity matching
- alignment: constraint compliance and cascade detection
- board: @board prefix routing (private board meetings)
"""
from pydantic import BaseModel, ConfigDict
class BenchmarkCase(BaseModel):
    """A single benchmark test case with its ground-truth labels.

    The model is configured with ``frozen=True``, so fields cannot be
    reassigned after an instance has been created.
    """

    model_config = ConfigDict(frozen=True)

    # Case identifier, e.g. "route-kw-direct-001".
    id: str
    # Raw input text of the case.
    input: str
    # Name of the skill expected for this input. None is used for inputs that
    # should not route to any skill (greetings, fallbacks) and is also the
    # default when a case does not specify a skill.
    expected_skill: str | None = None
    # Expected execution mode label; values used in this file include
    # "direct", "react", "rewoo", "reflexion", "plan_exec", "llm_generate",
    # "tool_call", "custom" and "board".
    expected_execution_mode: str = "direct"
    # Expected complexity label; this file uses "low", "medium" and "high".
    expected_complexity: str = "low"
    # Top-level benchmark group (see the module docstring for the list).
    category: str
    # Finer-grained group within the category, e.g. "keyword_match".
    subcategory: str
    # Alternative phrasings of `input`; may be empty.
    paraphrases: list[str] = []
    # Free-form labels attached to the case.
    tags: list[str] = []
# ═══════════════════════════════════════════════════════════════════════════
# Routing — Keyword Match (aligned with actual skills)
# ═══════════════════════════════════════════════════════════════════════════
# Routing cases in the "keyword_match" subcategory, grouped by the skill each
# input is expected to route to. Every case carries paraphrases of its input.
# NOTE(review): `tags` presumably mirror the trigger keywords declared in
# configs/skills/*.yaml — confirm against the skill configs.
ROUTING_KEYWORD_BENCHMARKS: list[BenchmarkCase] = [
    # direct_agent
    BenchmarkCase(
        id="route-kw-direct-001",
        input="翻译这段话",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["帮我翻译一下", "请翻译这段内容", "Translate this text"],
        tags=["翻译", "translate"],
    ),
    BenchmarkCase(
        id="route-kw-direct-002",
        input="帮我总结一下",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["请总结", "给我一个摘要", "Summarize this"],
        tags=["摘要", "summarize"],
    ),
    BenchmarkCase(
        id="route-kw-direct-003",
        input="什么是RAG",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["RAG是什么", "解释一下RAG", "What is RAG?"],
        tags=["什么是"],
    ),
    # react_agent
    BenchmarkCase(
        id="route-kw-react-001",
        input="搜索一下AI Agent市场数据",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=[
            "帮我搜索AI Agent市场信息",
            "查找AI Agent的市场数据",
            "Search AI Agent market data",
        ],
        tags=["搜索", "search"],
    ),
    BenchmarkCase(
        id="route-kw-react-002",
        input="帮我分析这个数据",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["分析一下这些数据", "请对数据做分析", "Analyze this data"],
        tags=["分析", "analyze"],
    ),
    BenchmarkCase(
        id="route-kw-react-003",
        input="实时监控竞品动态",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["监控竞争对手的动态", "实时追踪竞品变化", "Monitor competitor activities"],
        tags=["实时", "监控"],
    ),
    # rewoo_agent
    BenchmarkCase(
        id="route-kw-rewoo-001",
        input="采集A、B、C三个竞品的功能数据",
        expected_skill="rewoo_agent",
        expected_execution_mode="rewoo",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=[
            "批量采集竞品数据",
            "并行获取多个竞品信息",
            "Fetch data from multiple competitors",
        ],
        tags=["采集", "批量", "fetch"],
    ),
    BenchmarkCase(
        id="route-kw-rewoo-002",
        input="并行搜索多个关键词",
        expected_skill="rewoo_agent",
        expected_execution_mode="rewoo",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["同时搜索多个关键词", "批量搜索", "Search multiple keywords in parallel"],
        tags=["并行", "批量"],
    ),
    # reflexion_agent
    BenchmarkCase(
        id="route-kw-reflex-001",
        input="审查这段代码的合规性",
        expected_skill="reflexion_agent",
        expected_execution_mode="reflexion",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["检查代码是否合规", "审查代码合规问题", "Review code compliance"],
        tags=["审查", "合规", "review"],
    ),
    BenchmarkCase(
        id="route-kw-reflex-002",
        input="生成一个高精度的数据分析脚本",
        expected_skill="reflexion_agent",
        expected_execution_mode="reflexion",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=[
            "写一个精确的数据分析脚本",
            "生成高精度分析代码",
            "Generate a precise analysis script",
        ],
        tags=["代码生成", "精确", "code"],
    ),
    # plan_exec_agent
    BenchmarkCase(
        id="route-kw-planexec-001",
        input="生成一份市场分析报告",
        expected_skill="plan_exec_agent",
        expected_execution_mode="plan_exec",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["做一份市场分析报告", "写个市场分析报告", "Generate a market analysis report"],
        tags=["报告", "分析报告"],
    ),
    BenchmarkCase(
        id="route-kw-planexec-002",
        input="规划产品优化方案",
        expected_skill="plan_exec_agent",
        expected_execution_mode="plan_exec",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["制定产品优化计划", "帮我规划产品优化", "Plan product optimization"],
        tags=["规划", "plan"],
    ),
    # code_reviewer
    # NOTE(review): code_reviewer is labelled direct/low here (and in
    # route-edge-disambig-001) but react/medium in the semantic_router cases
    # semantic-colloquial-review-001 and semantic-mixed-review-001 — confirm
    # which labels match the skill config.
    BenchmarkCase(
        id="route-kw-coderev-001",
        input="Review this code for quality",
        expected_skill="code_reviewer",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["审查这段代码的质量", "代码审查", "Check code quality"],
        tags=["review", "代码审查"],
    ),
    # geo_optimizer
    BenchmarkCase(
        id="route-kw-geo-001",
        input="帮我优化这篇文章的SEO",
        expected_skill="geo_optimizer",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["SEO优化一下", "提升文章搜索排名", "Optimize this article for SEO"],
        tags=["SEO优化", "optimize"],
    ),
    # deai_agent
    BenchmarkCase(
        id="route-kw-deai-001",
        input="帮我把这篇文章去AI化",
        expected_skill="deai_agent",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["让这段文字更自然", "改写得像人写的", "Make this text more natural"],
        tags=["去AI化", "人性化"],
    ),
    # content_generator
    BenchmarkCase(
        id="route-kw-content-001",
        input="帮我写一篇关于AI的文章",
        expected_skill="content_generator",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["写一篇AI相关文章", "生成关于AI的内容", "Write an article about AI"],
        tags=["写文章", "generate"],
    ),
    # citation_detector
    BenchmarkCase(
        id="route-kw-citation-001",
        input="检测我们的品牌在AI平台的引用情况",
        expected_skill="citation_detector",
        expected_execution_mode="custom",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=[
            "分析品牌引用率",
            "哪些AI平台引用了我们",
            "Check brand citation on AI platforms",
        ],
        tags=["引用检测", "citation"],
    ),
    # trend_agent
    BenchmarkCase(
        id="route-kw-trend-001",
        input="分析品牌趋势",
        expected_skill="trend_agent",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["最近的热点话题是什么", "趋势洞察", "Analyze brand trends"],
        tags=["趋势", "trend"],
    ),
    # competitor_analyzer
    BenchmarkCase(
        id="route-kw-competitor-001",
        input="分析我的竞品策略",
        expected_skill="competitor_analyzer",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["对比我和竞品的差距", "竞品分析", "Analyze competitor strategies"],
        tags=["竞品", "competitor"],
    ),
    # schema_advisor
    BenchmarkCase(
        id="route-kw-schema-001",
        input="帮我优化Schema",
        expected_skill="schema_advisor",
        expected_execution_mode="custom",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["生成JSON-LD结构化数据", "Schema有什么可以改进的", "Optimize my Schema"],
        tags=["Schema", "schema优化"],
    ),
    # monitor
    # NOTE(review): monitor is labelled "custom" here but "tool_call" in
    # semantic-mixed-monitor-001 — confirm which mode the skill config uses.
    BenchmarkCase(
        id="route-kw-monitor-001",
        input="监测品牌引用变化",
        expected_skill="monitor",
        expected_execution_mode="custom",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["追踪效果", "品牌排名变化", "Monitor brand citation changes"],
        tags=["监测", "monitor"],
    ),
    # goal_driven_agent
    BenchmarkCase(
        id="route-kw-goal-001",
        input="分析竞品SEO策略并生成优化方案",
        expected_skill="goal_driven_agent",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=[
            "调研技术方案并生成对比报告",
            "制定市场推广计划",
            "Analyze SEO and generate plan",
        ],
        tags=["分析", "优化方案"],
    ),
]
# ═══════════════════════════════════════════════════════════════════════════
# Routing — Edge Cases (manually curated)
# ═══════════════════════════════════════════════════════════════════════════
# Routing edge cases: greetings, identity questions, an explicit "@skill:"
# prefix, fallbacks with no matching skill, and one ambiguous multi-skill
# input. Cases with expected_skill=None are the ones that should not route to
# any skill.
ROUTING_EDGE_BENCHMARKS: list[BenchmarkCase] = [
    # Greeting (should NOT route to any skill)
    BenchmarkCase(
        id="route-edge-greet-001",
        input="你好",
        expected_skill=None,
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="greeting",
        paraphrases=["Hello", "Hi there", "早上好"],
        tags=["greeting"],
    ),
    BenchmarkCase(
        id="route-edge-greet-002",
        input="Good morning!",
        expected_skill=None,
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="greeting",
        paraphrases=["早上好!", "你好呀"],
        tags=["greeting"],
    ),
    # Identity (should NOT route to any skill)
    BenchmarkCase(
        id="route-edge-identity-001",
        input="你是谁?",
        expected_skill=None,
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="identity",
        paraphrases=["What is your name?", "介绍一下你自己", "Tell me about yourself"],
        tags=["identity"],
    ),
    # Explicit prefix
    BenchmarkCase(
        id="route-edge-explicit-001",
        input="@skill:react_agent 搜索最新的AI新闻",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="routing",
        subcategory="explicit_prefix",
        paraphrases=["@skill:react_agent 查找AI最新动态"],
        tags=["explicit", "react"],
    ),
    # Fallback (no matching skill)
    BenchmarkCase(
        id="route-edge-fallback-001",
        input="告诉我一个笑话",
        expected_skill=None,
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="fallback",
        paraphrases=["讲个笑话", "Tell me a joke", "说个搞笑的"],
        tags=["fallback"],
    ),
    BenchmarkCase(
        id="route-edge-fallback-002",
        input="What is quantum physics?",
        expected_skill=None,
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="fallback",
        paraphrases=["量子物理是什么", "Explain quantum mechanics"],
        tags=["fallback"],
    ),
    # Disambiguation (multiple skills could match)
    # NOTE(review): the ground truth picks code_reviewer over the SEO skill;
    # presumably the first-mentioned intent wins — confirm the intended
    # tie-break rule.
    BenchmarkCase(
        id="route-edge-disambig-001",
        input="审查代码并优化SEO",
        expected_skill="code_reviewer",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="disambiguation",
        paraphrases=["Review code and optimize SEO", "代码审查加SEO优化"],
        tags=["disambiguation", "review", "seo"],
    ),
]
# ═══════════════════════════════════════════════════════════════════════════
# Execution Mode Benchmarks
# ═══════════════════════════════════════════════════════════════════════════
# Execution-mode cases: one subcategory per mode ("direct_mode", "react_mode",
# "rewoo_mode", "reflexion_mode", "plan_exec_mode") plus a "quality_gate" case.
EXECUTION_BENCHMARKS: list[BenchmarkCase] = [
    # direct mode
    BenchmarkCase(
        id="exec-direct-001",
        input="翻译这段话成英文",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="execution",
        subcategory="direct_mode",
        paraphrases=["Translate this to English", "把这段翻成英语"],
        tags=["direct", "simple"],
    ),
    BenchmarkCase(
        id="exec-direct-002",
        input="什么是AgentKit",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="execution",
        subcategory="direct_mode",
        paraphrases=["AgentKit是什么", "Explain AgentKit"],
        tags=["direct", "qa"],
    ),
    # react mode
    BenchmarkCase(
        id="exec-react-001",
        input="搜索并分析AI行业最新趋势",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="execution",
        subcategory="react_mode",
        paraphrases=["Search and analyze AI trends", "调研AI行业趋势"],
        tags=["react", "multi_step"],
    ),
    BenchmarkCase(
        id="exec-react-002",
        input="实时监控竞品动态并生成报告",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="execution",
        subcategory="react_mode",
        paraphrases=["Monitor competitors and report", "追踪竞品并输出报告"],
        tags=["react", "monitoring"],
    ),
    # rewoo mode
    BenchmarkCase(
        id="exec-rewoo-001",
        input="批量采集多个竞品的功能数据",
        expected_skill="rewoo_agent",
        expected_execution_mode="rewoo",
        expected_complexity="high",
        category="execution",
        subcategory="rewoo_mode",
        paraphrases=["并行获取竞品数据", "Fetch competitor data in parallel"],
        tags=["rewoo", "parallel"],
    ),
    # reflexion mode
    BenchmarkCase(
        id="exec-reflexion-001",
        input="审查代码合规性并确保高精度",
        expected_skill="reflexion_agent",
        expected_execution_mode="reflexion",
        expected_complexity="high",
        category="execution",
        subcategory="reflexion_mode",
        paraphrases=["高精度代码审查", "Precise code compliance review"],
        tags=["reflexion", "precision"],
    ),
    # plan_exec mode
    BenchmarkCase(
        id="exec-planexec-001",
        input="生成一份完整的市场调研报告",
        expected_skill="plan_exec_agent",
        expected_execution_mode="plan_exec",
        expected_complexity="high",
        category="execution",
        subcategory="plan_exec_mode",
        paraphrases=["做一份市场调研报告", "Generate a market research report"],
        tags=["plan_exec", "report"],
    ),
    # quality gate
    BenchmarkCase(
        id="exec-quality-001",
        input="生成内容并确保质量达标",
        expected_skill="content_generator",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="execution",
        subcategory="quality_gate",
        paraphrases=["生成高质量内容", "Generate quality content"],
        tags=["quality", "content"],
    ),
]
# ═══════════════════════════════════════════════════════════════════════════
# Team Collaboration Benchmarks
# ═══════════════════════════════════════════════════════════════════════════
# Team-collaboration cases. No case in this list sets expected_skill, so it
# stays at its default (None); only the execution mode and complexity labels
# are specified. The two name_validation cases carry no paraphrases.
TEAM_BENCHMARKS: list[BenchmarkCase] = [
    # Explicit "@team:" prefix naming the members
    BenchmarkCase(
        id="team-explicit-001",
        input="@team:react_agent,plan_exec_agent 协作完成深度分析并生成报告",
        expected_execution_mode="react",
        expected_complexity="high",
        category="team",
        subcategory="explicit_team",
        paraphrases=[
            "需要react_agent和plan_exec_agent协作",
            "组建团队:搜索分析+报告生成",
        ],
        tags=["team", "explicit"],
    ),
    BenchmarkCase(
        id="team-explicit-002",
        input="@team:competitor_analyzer,trend_agent 分析竞品并追踪趋势",
        expected_execution_mode="react",
        expected_complexity="high",
        category="team",
        subcategory="explicit_team",
        paraphrases=["竞品分析+趋势追踪团队", "Team for competitor and trend analysis"],
        tags=["team", "explicit"],
    ),
    # No prefix; subcategory "complexity_trigger"
    BenchmarkCase(
        id="team-complexity-001",
        input="深度分析竞品策略、追踪品牌趋势并生成优化方案",
        expected_execution_mode="react",
        expected_complexity="high",
        category="team",
        subcategory="complexity_trigger",
        paraphrases=[
            "全面竞品分析和优化方案",
            "Comprehensive competitor analysis with optimization",
        ],
        tags=["team", "complexity"],
    ),
    # Fallback when no expert matches
    BenchmarkCase(
        id="team-fallback-001",
        input="复杂任务但无匹配专家",
        expected_execution_mode="react",
        expected_complexity="high",
        category="team",
        subcategory="fallback",
        paraphrases=["需要团队但找不到合适专家", "Complex task without matching experts"],
        tags=["team", "fallback"],
    ),
    # Name validation: a well-formed member list, then a malformed one that is
    # expected to fall back to direct/low.
    BenchmarkCase(
        id="team-name-valid-001",
        input="@team:react_agent,plan_exec_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="team",
        subcategory="name_validation",
        tags=["team", "validation"],
    ),
    BenchmarkCase(
        id="team-name-invalid-001",
        input="@team:invalid expert name",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="team",
        subcategory="name_validation",
        tags=["team", "validation", "invalid"],
    ),
]
# ═══════════════════════════════════════════════════════════════════════════
# Consistency Benchmarks
# ═══════════════════════════════════════════════════════════════════════════
# Consistency cases, all in the "deterministic" subcategory. None of them
# defines paraphrases. Several inputs repeat ones from the keyword-routing
# list with the same expected labels (e.g. "什么是RAG").
CONSISTENCY_BENCHMARKS: list[BenchmarkCase] = [
    BenchmarkCase(
        id="consist-direct-001",
        input="翻译'hello world'成中文",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="consistency",
        subcategory="deterministic",
        tags=["consistency", "translation"],
    ),
    BenchmarkCase(
        id="consist-direct-002",
        input="什么是RAG",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="consistency",
        subcategory="deterministic",
        tags=["consistency", "qa"],
    ),
    BenchmarkCase(
        id="consist-react-001",
        input="搜索AI Agent市场数据",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="consistency",
        subcategory="deterministic",
        tags=["consistency", "search"],
    ),
    BenchmarkCase(
        id="consist-geo-001",
        input="帮我优化这篇文章的SEO",
        expected_skill="geo_optimizer",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="consistency",
        subcategory="deterministic",
        tags=["consistency", "seo"],
    ),
    BenchmarkCase(
        id="consist-deai-001",
        input="帮我把这篇文章去AI化",
        expected_skill="deai_agent",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="consistency",
        subcategory="deterministic",
        tags=["consistency", "deai"],
    ),
]
# ═══════════════════════════════════════════════════════════════════════════
# Semantic Router Benchmarks
# ═══════════════════════════════════════════════════════════════════════════
# Semantic-router cases in three subcategories: "description_match",
# "colloquial_match" and "mixed_lang_match".
# NOTE(review): the description_match inputs read like skill descriptions —
# presumably taken from configs/skills/*.yaml; confirm.
SEMANTIC_ROUTER_BENCHMARKS: list[BenchmarkCase] = [
    # --- Description-style inputs ---
    BenchmarkCase(
        id="semantic-direct-001",
        input="简单生成任务,无需工具调用",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["只需要一次生成的简单任务", "Single LLM call task"],
        tags=["semantic", "direct"],
    ),
    BenchmarkCase(
        id="semantic-react-001",
        input="需要动态适应、逐步推理和工具调用",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["需要多步推理和工具", "Multi-step reasoning with tools"],
        tags=["semantic", "react"],
    ),
    BenchmarkCase(
        id="semantic-rewoo-001",
        input="多源数据并行采集、无依赖工具调用批量执行",
        expected_skill="rewoo_agent",
        expected_execution_mode="rewoo",
        expected_complexity="high",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["并行批量获取数据", "Parallel data collection"],
        tags=["semantic", "rewoo"],
    ),
    BenchmarkCase(
        id="semantic-reflex-001",
        input="需要高精度和自我验证的任务",
        expected_skill="reflexion_agent",
        expected_execution_mode="reflexion",
        expected_complexity="high",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["需要自我检查的高精度任务", "High-precision self-verification task"],
        tags=["semantic", "reflexion"],
    ),
    BenchmarkCase(
        id="semantic-planexec-001",
        input="结构化多步骤任务,需要可审查的规划和执行",
        expected_skill="plan_exec_agent",
        expected_execution_mode="plan_exec",
        expected_complexity="high",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["需要先规划再执行的任务", "Structured planning and execution"],
        tags=["semantic", "plan_exec"],
    ),
    BenchmarkCase(
        id="semantic-geo-001",
        input="对文章进行GEO/SEO优化提升在AI搜索引擎中的可见性",
        expected_skill="geo_optimizer",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["提升内容搜索排名", "Improve content visibility in AI search"],
        tags=["semantic", "geo"],
    ),
    BenchmarkCase(
        id="semantic-citation-001",
        input="检测品牌在各AI平台回答中的引用情况",
        expected_skill="citation_detector",
        expected_execution_mode="custom",
        expected_complexity="medium",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["分析品牌被AI引用的情况", "Check brand citation across AI platforms"],
        tags=["semantic", "citation"],
    ),
    BenchmarkCase(
        id="semantic-competitor-001",
        input="分析竞品策略、对比品牌差距或发现竞争机会",
        expected_skill="competitor_analyzer",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["竞品对比和差距分析", "Competitive gap analysis"],
        tags=["semantic", "competitor"],
    ),
    # --- Colloquial / casual expressions ---
    # NOTE(review): code_reviewer is labelled react/medium in this list but
    # direct/low in route-kw-coderev-001 and route-edge-disambig-001 — confirm
    # which labels match the skill config.
    BenchmarkCase(
        id="semantic-colloquial-review-001",
        input="帮我看看代码有没有问题",
        expected_skill="code_reviewer",
        expected_execution_mode="react",
        expected_complexity="medium",
        category="semantic_router",
        subcategory="colloquial_match",
        paraphrases=["代码审查一下", "Check my code for issues"],
        tags=["semantic", "colloquial", "code_review"],
    ),
    BenchmarkCase(
        id="semantic-colloquial-trend-001",
        input="最近市场行情怎么样",
        expected_skill="trend_agent",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="semantic_router",
        subcategory="colloquial_match",
        paraphrases=["市场走势如何", "What's the market trend"],
        tags=["semantic", "colloquial", "trend"],
    ),
    BenchmarkCase(
        id="semantic-colloquial-content-001",
        input="帮我写点东西",
        expected_skill="content_generator",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="semantic_router",
        subcategory="colloquial_match",
        paraphrases=["写篇文章吧", "Write something for me"],
        tags=["semantic", "colloquial", "content"],
    ),
    BenchmarkCase(
        id="semantic-colloquial-citation-001",
        input="这个引用对不对",
        expected_skill="citation_detector",
        expected_execution_mode="custom",
        expected_complexity="medium",
        category="semantic_router",
        subcategory="colloquial_match",
        paraphrases=["查查引用准不准", "Are these citations correct"],
        tags=["semantic", "colloquial", "citation"],
    ),
    BenchmarkCase(
        id="semantic-colloquial-competitor-001",
        input="对手怎么样",
        expected_skill="competitor_analyzer",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="semantic_router",
        subcategory="colloquial_match",
        paraphrases=["竞品啥情况", "How are competitors doing"],
        tags=["semantic", "colloquial", "competitor"],
    ),
    # --- Mixed Chinese-English expressions ---
    BenchmarkCase(
        id="semantic-mixed-review-001",
        input="review一下这段代码",
        expected_skill="code_reviewer",
        expected_execution_mode="react",
        expected_complexity="medium",
        category="semantic_router",
        subcategory="mixed_lang_match",
        paraphrases=["帮我review代码", "Code review please"],
        tags=["semantic", "mixed", "code_review"],
    ),
    BenchmarkCase(
        id="semantic-mixed-geo-001",
        input="做个SEO优化",
        expected_skill="geo_optimizer",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="semantic_router",
        subcategory="mixed_lang_match",
        paraphrases=["GEO优化一下", "Optimize for AI search"],
        tags=["semantic", "mixed", "geo"],
    ),
    # NOTE(review): monitor is labelled "tool_call" here but "custom" in
    # route-kw-monitor-001 — confirm which mode the skill config uses.
    BenchmarkCase(
        id="semantic-mixed-monitor-001",
        input="monitor一下系统状态",
        expected_skill="monitor",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="semantic_router",
        subcategory="mixed_lang_match",
        paraphrases=["监控系统运行", "Monitor system status"],
        tags=["semantic", "mixed", "monitor"],
    ),
]
# ═══════════════════════════════════════════════════════════════════════════
# Alignment Guard Benchmarks
# ═══════════════════════════════════════════════════════════════════════════
# Alignment cases: inputs carrying a negative constraint, a positive
# constraint, both combined, a repeated-search ("cascade_detection") input, and
# an unconstrained baseline. None of them defines paraphrases.
# NOTE(review): only routing labels are stored here; how constraint compliance
# itself is checked is not visible in this file.
ALIGNMENT_BENCHMARKS: list[BenchmarkCase] = [
    BenchmarkCase(
        id="align-negative-001",
        input="写一篇产品介绍,不要提及价格",
        expected_skill="content_generator",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="alignment",
        subcategory="negative_constraint",
        tags=["alignment", "negative_constraint"],
    ),
    BenchmarkCase(
        id="align-positive-001",
        input="生成报告,必须包含摘要部分",
        expected_skill="plan_exec_agent",
        expected_execution_mode="plan_exec",
        expected_complexity="high",
        category="alignment",
        subcategory="positive_constraint",
        tags=["alignment", "positive_constraint"],
    ),
    BenchmarkCase(
        id="align-cascade-001",
        input="反复搜索相同关键词",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="alignment",
        subcategory="cascade_detection",
        tags=["alignment", "cascade"],
    ),
    BenchmarkCase(
        id="align-no-constraint-001",
        input="帮我写一篇文章",
        expected_skill="content_generator",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="alignment",
        subcategory="no_constraint",
        tags=["alignment", "baseline"],
    ),
    BenchmarkCase(
        id="align-combined-001",
        input="生成竞品分析报告,必须包含对比表格,不要提及内部数据",
        expected_skill="competitor_analyzer",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="alignment",
        subcategory="combined_constraint",
        tags=["alignment", "combined"],
    ),
]
# ═══════════════════════════════════════════════════════════════════════════
# Board Meeting (Private Board) Benchmarks — @board prefix routing
# ═══════════════════════════════════════════════════════════════════════════
# Board-meeting cases for the "@board" prefix. No case sets expected_skill, so
# it stays at its default (None). Every case expects the "board" execution
# mode except board-edge-no-prefix-001, which has no prefix and expects
# "react".
BOARD_BENCHMARKS: list[BenchmarkCase] = [
    # --- Default template (@board without experts) ---
    BenchmarkCase(
        id="board-default-001",
        input="@board 讨论是否应该进入东南亚市场",
        expected_execution_mode="board",
        expected_complexity="high",
        category="board",
        subcategory="default_template",
        paraphrases=[
            "@board 我们要不要拓展东南亚业务",
            "@board 东南亚市场进入策略讨论",
            "@board:private_board 评估东南亚市场机会",
        ],
        tags=["board", "default", "strategy"],
    ),
    BenchmarkCase(
        id="board-default-002",
        input="@board AI产品定价策略应该怎么做",
        expected_execution_mode="board",
        expected_complexity="high",
        category="board",
        subcategory="default_template",
        paraphrases=["@board 如何给AI产品定价", "@board AI产品定价讨论"],
        tags=["board", "default", "pricing"],
    ),
    # --- Explicit expert list (@board:expert1,expert2) ---
    BenchmarkCase(
        id="board-explicit-001",
        input="@board:elon_musk,jeff_bezos 讨论火星殖民的商业化路径",
        expected_execution_mode="board",
        expected_complexity="high",
        category="board",
        subcategory="explicit_experts",
        paraphrases=[
            "@board:elon_musk,jeff_bezos 火星商业化方案",
            "@board:jeff_bezos,elon_musk 如何商业化火星",
        ],
        tags=["board", "explicit", "mars"],
    ),
    BenchmarkCase(
        id="board-explicit-002",
        input="@board:charlie_munger,warren_buffett 价值投资在AI时代的适用性",
        expected_execution_mode="board",
        expected_complexity="high",
        category="board",
        subcategory="explicit_experts",
        paraphrases=[
            "@board:charlie_munger,warren_buffett AI时代还要不要价值投资",
        ],
        tags=["board", "explicit", "investing"],
    ),
    # --- Explicit default template name ---
    BenchmarkCase(
        id="board-template-001",
        input="@board:private_board 讨论创业公司融资节奏",
        expected_execution_mode="board",
        expected_complexity="high",
        category="board",
        subcategory="explicit_template",
        paraphrases=["@board:private_board 创业融资策略", "@board:private_board 融资节奏讨论"],
        tags=["board", "template", "fundraising"],
    ),
    # --- Edge cases ---
    # Bare prefix with no topic: still "board" mode, but low complexity.
    BenchmarkCase(
        id="board-edge-empty-topic-001",
        input="@board",
        expected_execution_mode="board",
        expected_complexity="low",
        category="board",
        subcategory="empty_topic",
        tags=["board", "edge", "empty"],
    ),
    # Discussion-style input without the prefix: not expected to use "board".
    BenchmarkCase(
        id="board-edge-no-prefix-001",
        input="讨论一下市场策略",
        expected_execution_mode="react",
        expected_complexity="medium",
        category="board",
        subcategory="no_prefix",
        paraphrases=["分析市场策略", "市场策略讨论"],
        tags=["board", "edge", "no_match"],
    ),
    # --- Name validation ---
    BenchmarkCase(
        id="board-name-valid-001",
        input="@board:elon_musk,jeff_bezos,allenzhang 产品设计哲学",
        expected_execution_mode="board",
        expected_complexity="high",
        category="board",
        subcategory="name_validation",
        tags=["board", "validation", "valid"],
    ),
    BenchmarkCase(
        id="board-name-invalid-001",
        input="@board:@#$ 讨论主题",
        expected_execution_mode="board",
        expected_complexity="low",
        category="board",
        subcategory="name_validation",
        tags=["board", "validation", "invalid"],
    ),
    # --- Stop command (user intervention) ---
    # NOTE(review): the paraphrases are plain-text stop requests without the
    # "/stop" form — presumably only meaningful inside an active board
    # session; confirm.
    BenchmarkCase(
        id="board-stop-001",
        input="/stop",
        expected_execution_mode="board",
        expected_complexity="low",
        category="board",
        subcategory="stop_command",
        paraphrases=["停止讨论", "结束讨论"],
        tags=["board", "stop", "intervention"],
    ),
]
# ═══════════════════════════════════════════════════════════════════════════
# All benchmarks combined
# ═══════════════════════════════════════════════════════════════════════════
# Every benchmark case in one flat list, concatenated section by section in
# the order the sections are defined in this module.
ALL_BENCHMARKS: list[BenchmarkCase] = [
    *ROUTING_KEYWORD_BENCHMARKS,
    *ROUTING_EDGE_BENCHMARKS,
    *EXECUTION_BENCHMARKS,
    *TEAM_BENCHMARKS,
    *CONSISTENCY_BENCHMARKS,
    *SEMANTIC_ROUTER_BENCHMARKS,
    *ALIGNMENT_BENCHMARKS,
    *BOARD_BENCHMARKS,
]
def get_benchmarks_by_category(category: str) -> list[BenchmarkCase]:
    """Return the cases whose ``category`` equals *category*.

    Cases are taken from ``ALL_BENCHMARKS`` and keep their original order; an
    unknown category yields an empty list.
    """
    return list(filter(lambda case: case.category == category, ALL_BENCHMARKS))
def get_benchmarks_by_subcategory(subcategory: str) -> list[BenchmarkCase]:
    """Return the cases whose ``subcategory`` equals *subcategory*.

    Cases are taken from ``ALL_BENCHMARKS`` and keep their original order; an
    unknown subcategory yields an empty list.
    """
    selected: list[BenchmarkCase] = []
    for case in ALL_BENCHMARKS:
        if case.subcategory == subcategory:
            selected.append(case)
    return selected
def get_benchmarks_with_paraphrases() -> list[BenchmarkCase]:
    """Return the cases that define at least one paraphrase.

    Intended for overfitting detection; cases with an empty ``paraphrases``
    list are left out. Order follows ``ALL_BENCHMARKS``.
    """
    paraphrased: list[BenchmarkCase] = []
    for case in ALL_BENCHMARKS:
        if not case.paraphrases:
            continue
        paraphrased.append(case)
    return paraphrased
def get_skill_names_needed() -> set[str]:
    """Return every skill name referenced by a benchmark (for pre-registration).

    Cases whose ``expected_skill`` is None contribute nothing.
    """
    skill_names: set[str] = set()
    for case in ALL_BENCHMARKS:
        skill = case.expected_skill
        if skill is not None:
            skill_names.add(skill)
    return skill_names
def get_benchmark_stats() -> dict[str, int]:
    """Return the number of benchmark cases per category.

    The mapping has one entry per category found in ``ALL_BENCHMARKS`` plus a
    ``"total"`` entry, written last, holding the overall number of cases.
    """
    per_category: dict[str, int] = {}
    for case in ALL_BENCHMARKS:
        name = case.category
        if name in per_category:
            per_category[name] += 1
        else:
            per_category[name] = 1
    per_category["total"] = len(ALL_BENCHMARKS)
    return per_category