"""Agent Capability Benchmark — Ground Truth Dataset (v2). Aligned with actual skills in configs/skills/*.yaml. Contains both manually curated edge cases and auto-generated cases. Categories: - routing: intent routing correctness - execution: execution mode selection accuracy - team: expert team collaboration - consistency: deterministic output consistency - semantic_router: semantic similarity matching - alignment: constraint compliance and cascade detection """ from pydantic import BaseModel, ConfigDict class BenchmarkCase(BaseModel): """A single benchmark test case with ground truth label.""" model_config = ConfigDict(frozen=True) id: str input: str expected_skill: str | None = None expected_execution_mode: str = "direct" expected_complexity: str = "low" category: str subcategory: str paraphrases: list[str] = [] tags: list[str] = [] # ═══════════════════════════════════════════════════════════════════════════ # Routing — Keyword Match (aligned with actual skills) # ═══════════════════════════════════════════════════════════════════════════ ROUTING_KEYWORD_BENCHMARKS: list[BenchmarkCase] = [ # direct_agent BenchmarkCase( id="route-kw-direct-001", input="翻译这段话", expected_skill="direct_agent", expected_execution_mode="direct", expected_complexity="low", category="routing", subcategory="keyword_match", paraphrases=["帮我翻译一下", "请翻译这段内容", "Translate this text"], tags=["翻译", "translate"], ), BenchmarkCase( id="route-kw-direct-002", input="帮我总结一下", expected_skill="direct_agent", expected_execution_mode="direct", expected_complexity="low", category="routing", subcategory="keyword_match", paraphrases=["请总结", "给我一个摘要", "Summarize this"], tags=["摘要", "summarize"], ), BenchmarkCase( id="route-kw-direct-003", input="什么是RAG?", expected_skill="direct_agent", expected_execution_mode="direct", expected_complexity="low", category="routing", subcategory="keyword_match", paraphrases=["RAG是什么", "解释一下RAG", "What is RAG?"], tags=["什么是"], ), # react_agent BenchmarkCase( id="route-kw-react-001", input="搜索一下AI Agent市场数据", expected_skill="react_agent", expected_execution_mode="react", expected_complexity="high", category="routing", subcategory="keyword_match", paraphrases=[ "帮我搜索AI Agent市场信息", "查找AI Agent的市场数据", "Search AI Agent market data", ], tags=["搜索", "search"], ), BenchmarkCase( id="route-kw-react-002", input="帮我分析这个数据", expected_skill="react_agent", expected_execution_mode="react", expected_complexity="high", category="routing", subcategory="keyword_match", paraphrases=["分析一下这些数据", "请对数据做分析", "Analyze this data"], tags=["分析", "analyze"], ), BenchmarkCase( id="route-kw-react-003", input="实时监控竞品动态", expected_skill="react_agent", expected_execution_mode="react", expected_complexity="high", category="routing", subcategory="keyword_match", paraphrases=["监控竞争对手的动态", "实时追踪竞品变化", "Monitor competitor activities"], tags=["实时", "监控"], ), # rewoo_agent BenchmarkCase( id="route-kw-rewoo-001", input="采集A、B、C三个竞品的功能数据", expected_skill="rewoo_agent", expected_execution_mode="rewoo", expected_complexity="high", category="routing", subcategory="keyword_match", paraphrases=[ "批量采集竞品数据", "并行获取多个竞品信息", "Fetch data from multiple competitors", ], tags=["采集", "批量", "fetch"], ), BenchmarkCase( id="route-kw-rewoo-002", input="并行搜索多个关键词", expected_skill="rewoo_agent", expected_execution_mode="rewoo", expected_complexity="high", category="routing", subcategory="keyword_match", paraphrases=["同时搜索多个关键词", "批量搜索", "Search multiple keywords in parallel"], tags=["并行", "批量"], ), # reflexion_agent BenchmarkCase( id="route-kw-reflex-001", input="审查这段代码的合规性", expected_skill="reflexion_agent", expected_execution_mode="reflexion", expected_complexity="high", category="routing", subcategory="keyword_match", paraphrases=["检查代码是否合规", "审查代码合规问题", "Review code compliance"], tags=["审查", "合规", "review"], ), BenchmarkCase( id="route-kw-reflex-002", input="生成一个高精度的数据分析脚本", expected_skill="reflexion_agent", expected_execution_mode="reflexion", expected_complexity="high", category="routing", subcategory="keyword_match", paraphrases=[ "写一个精确的数据分析脚本", "生成高精度分析代码", "Generate a precise analysis script", ], tags=["代码生成", "精确", "code"], ), # plan_exec_agent BenchmarkCase( id="route-kw-planexec-001", input="生成一份市场分析报告", expected_skill="plan_exec_agent", expected_execution_mode="plan_exec", expected_complexity="high", category="routing", subcategory="keyword_match", paraphrases=["做一份市场分析报告", "写个市场分析报告", "Generate a market analysis report"], tags=["报告", "分析报告"], ), BenchmarkCase( id="route-kw-planexec-002", input="规划产品优化方案", expected_skill="plan_exec_agent", expected_execution_mode="plan_exec", expected_complexity="high", category="routing", subcategory="keyword_match", paraphrases=["制定产品优化计划", "帮我规划产品优化", "Plan product optimization"], tags=["规划", "plan"], ), # code_reviewer BenchmarkCase( id="route-kw-coderev-001", input="Review this code for quality", expected_skill="code_reviewer", expected_execution_mode="direct", expected_complexity="low", category="routing", subcategory="keyword_match", paraphrases=["审查这段代码的质量", "代码审查", "Check code quality"], tags=["review", "代码审查"], ), # geo_optimizer BenchmarkCase( id="route-kw-geo-001", input="帮我优化这篇文章的SEO", expected_skill="geo_optimizer", expected_execution_mode="llm_generate", expected_complexity="low", category="routing", subcategory="keyword_match", paraphrases=["SEO优化一下", "提升文章搜索排名", "Optimize this article for SEO"], tags=["SEO优化", "optimize"], ), # deai_agent BenchmarkCase( id="route-kw-deai-001", input="帮我把这篇文章去AI化", expected_skill="deai_agent", expected_execution_mode="llm_generate", expected_complexity="low", category="routing", subcategory="keyword_match", paraphrases=["让这段文字更自然", "改写得像人写的", "Make this text more natural"], tags=["去AI化", "人性化"], ), # content_generator BenchmarkCase( id="route-kw-content-001", input="帮我写一篇关于AI的文章", expected_skill="content_generator", expected_execution_mode="llm_generate", expected_complexity="low", category="routing", subcategory="keyword_match", paraphrases=["写一篇AI相关文章", "生成关于AI的内容", "Write an article about AI"], tags=["写文章", "generate"], ), # citation_detector BenchmarkCase( id="route-kw-citation-001", input="检测我们的品牌在AI平台的引用情况", expected_skill="citation_detector", expected_execution_mode="custom", expected_complexity="medium", category="routing", subcategory="keyword_match", paraphrases=[ "分析品牌引用率", "哪些AI平台引用了我们", "Check brand citation on AI platforms", ], tags=["引用检测", "citation"], ), # trend_agent BenchmarkCase( id="route-kw-trend-001", input="分析品牌趋势", expected_skill="trend_agent", expected_execution_mode="tool_call", expected_complexity="medium", category="routing", subcategory="keyword_match", paraphrases=["最近的热点话题是什么", "趋势洞察", "Analyze brand trends"], tags=["趋势", "trend"], ), # competitor_analyzer BenchmarkCase( id="route-kw-competitor-001", input="分析我的竞品策略", expected_skill="competitor_analyzer", expected_execution_mode="tool_call", expected_complexity="medium", category="routing", subcategory="keyword_match", paraphrases=["对比我和竞品的差距", "竞品分析", "Analyze competitor strategies"], tags=["竞品", "competitor"], ), # schema_advisor BenchmarkCase( id="route-kw-schema-001", input="帮我优化Schema", expected_skill="schema_advisor", expected_execution_mode="custom", expected_complexity="medium", category="routing", subcategory="keyword_match", paraphrases=["生成JSON-LD结构化数据", "Schema有什么可以改进的", "Optimize my Schema"], tags=["Schema", "schema优化"], ), # monitor BenchmarkCase( id="route-kw-monitor-001", input="监测品牌引用变化", expected_skill="monitor", expected_execution_mode="custom", expected_complexity="medium", category="routing", subcategory="keyword_match", paraphrases=["追踪效果", "品牌排名变化", "Monitor brand citation changes"], tags=["监测", "monitor"], ), # goal_driven_agent BenchmarkCase( id="route-kw-goal-001", input="分析竞品SEO策略并生成优化方案", expected_skill="goal_driven_agent", expected_execution_mode="tool_call", expected_complexity="medium", category="routing", subcategory="keyword_match", paraphrases=[ "调研技术方案并生成对比报告", "制定市场推广计划", "Analyze SEO and generate plan", ], tags=["分析", "优化方案"], ), ] # ═══════════════════════════════════════════════════════════════════════════ # Routing — Edge Cases (manually curated) # ═══════════════════════════════════════════════════════════════════════════ ROUTING_EDGE_BENCHMARKS: list[BenchmarkCase] = [ # Greeting (should NOT route to any skill) BenchmarkCase( id="route-edge-greet-001", input="你好", expected_skill=None, expected_execution_mode="direct", expected_complexity="low", category="routing", subcategory="greeting", paraphrases=["Hello", "Hi there", "早上好"], tags=["greeting"], ), BenchmarkCase( id="route-edge-greet-002", input="Good morning!", expected_skill=None, expected_execution_mode="direct", expected_complexity="low", category="routing", subcategory="greeting", paraphrases=["早上好!", "你好呀"], tags=["greeting"], ), # Identity (should NOT route to any skill) BenchmarkCase( id="route-edge-identity-001", input="你是谁?", expected_skill=None, expected_execution_mode="direct", expected_complexity="low", category="routing", subcategory="identity", paraphrases=["What is your name?", "介绍一下你自己", "Tell me about yourself"], tags=["identity"], ), # Explicit prefix BenchmarkCase( id="route-edge-explicit-001", input="@skill:react_agent 搜索最新的AI新闻", expected_skill="react_agent", expected_execution_mode="react", expected_complexity="high", category="routing", subcategory="explicit_prefix", paraphrases=["@skill:react_agent 查找AI最新动态"], tags=["explicit", "react"], ), # Fallback (no matching skill) BenchmarkCase( id="route-edge-fallback-001", input="告诉我一个笑话", expected_skill=None, expected_execution_mode="direct", expected_complexity="low", category="routing", subcategory="fallback", paraphrases=["讲个笑话", "Tell me a joke", "说个搞笑的"], tags=["fallback"], ), BenchmarkCase( id="route-edge-fallback-002", input="What is quantum physics?", expected_skill=None, expected_execution_mode="direct", expected_complexity="low", category="routing", subcategory="fallback", paraphrases=["量子物理是什么", "Explain quantum mechanics"], tags=["fallback"], ), # Disambiguation (multiple skills could match) BenchmarkCase( id="route-edge-disambig-001", input="审查代码并优化SEO", expected_skill="code_reviewer", expected_execution_mode="direct", expected_complexity="low", category="routing", subcategory="disambiguation", paraphrases=["Review code and optimize SEO", "代码审查加SEO优化"], tags=["disambiguation", "review", "seo"], ), ] # ═══════════════════════════════════════════════════════════════════════════ # Execution Mode Benchmarks # ═══════════════════════════════════════════════════════════════════════════ EXECUTION_BENCHMARKS: list[BenchmarkCase] = [ BenchmarkCase( id="exec-direct-001", input="翻译这段话成英文", expected_skill="direct_agent", expected_execution_mode="direct", expected_complexity="low", category="execution", subcategory="direct_mode", paraphrases=["Translate this to English", "把这段翻成英语"], tags=["direct", "simple"], ), BenchmarkCase( id="exec-direct-002", input="什么是AgentKit?", expected_skill="direct_agent", expected_execution_mode="direct", expected_complexity="low", category="execution", subcategory="direct_mode", paraphrases=["AgentKit是什么", "Explain AgentKit"], tags=["direct", "qa"], ), BenchmarkCase( id="exec-react-001", input="搜索并分析AI行业最新趋势", expected_skill="react_agent", expected_execution_mode="react", expected_complexity="high", category="execution", subcategory="react_mode", paraphrases=["Search and analyze AI trends", "调研AI行业趋势"], tags=["react", "multi_step"], ), BenchmarkCase( id="exec-react-002", input="实时监控竞品动态并生成报告", expected_skill="react_agent", expected_execution_mode="react", expected_complexity="high", category="execution", subcategory="react_mode", paraphrases=["Monitor competitors and report", "追踪竞品并输出报告"], tags=["react", "monitoring"], ), BenchmarkCase( id="exec-rewoo-001", input="批量采集多个竞品的功能数据", expected_skill="rewoo_agent", expected_execution_mode="rewoo", expected_complexity="high", category="execution", subcategory="rewoo_mode", paraphrases=["并行获取竞品数据", "Fetch competitor data in parallel"], tags=["rewoo", "parallel"], ), BenchmarkCase( id="exec-reflexion-001", input="审查代码合规性并确保高精度", expected_skill="reflexion_agent", expected_execution_mode="reflexion", expected_complexity="high", category="execution", subcategory="reflexion_mode", paraphrases=["高精度代码审查", "Precise code compliance review"], tags=["reflexion", "precision"], ), BenchmarkCase( id="exec-planexec-001", input="生成一份完整的市场调研报告", expected_skill="plan_exec_agent", expected_execution_mode="plan_exec", expected_complexity="high", category="execution", subcategory="plan_exec_mode", paraphrases=["做一份市场调研报告", "Generate a market research report"], tags=["plan_exec", "report"], ), BenchmarkCase( id="exec-quality-001", input="生成内容并确保质量达标", expected_skill="content_generator", expected_execution_mode="llm_generate", expected_complexity="low", category="execution", subcategory="quality_gate", paraphrases=["生成高质量内容", "Generate quality content"], tags=["quality", "content"], ), ] # ═══════════════════════════════════════════════════════════════════════════ # Team Collaboration Benchmarks # ═══════════════════════════════════════════════════════════════════════════ TEAM_BENCHMARKS: list[BenchmarkCase] = [ BenchmarkCase( id="team-explicit-001", input="@team:react_agent,plan_exec_agent 协作完成深度分析并生成报告", expected_execution_mode="react", expected_complexity="high", category="team", subcategory="explicit_team", paraphrases=[ "需要react_agent和plan_exec_agent协作", "组建团队:搜索分析+报告生成", ], tags=["team", "explicit"], ), BenchmarkCase( id="team-explicit-002", input="@team:competitor_analyzer,trend_agent 分析竞品并追踪趋势", expected_execution_mode="react", expected_complexity="high", category="team", subcategory="explicit_team", paraphrases=["竞品分析+趋势追踪团队", "Team for competitor and trend analysis"], tags=["team", "explicit"], ), BenchmarkCase( id="team-complexity-001", input="深度分析竞品策略、追踪品牌趋势并生成优化方案", expected_execution_mode="react", expected_complexity="high", category="team", subcategory="complexity_trigger", paraphrases=[ "全面竞品分析和优化方案", "Comprehensive competitor analysis with optimization", ], tags=["team", "complexity"], ), BenchmarkCase( id="team-fallback-001", input="复杂任务但无匹配专家", expected_execution_mode="react", expected_complexity="high", category="team", subcategory="fallback", paraphrases=["需要团队但找不到合适专家", "Complex task without matching experts"], tags=["team", "fallback"], ), BenchmarkCase( id="team-name-valid-001", input="@team:react_agent,plan_exec_agent", expected_execution_mode="react", expected_complexity="high", category="team", subcategory="name_validation", tags=["team", "validation"], ), BenchmarkCase( id="team-name-invalid-001", input="@team:invalid expert name", expected_execution_mode="direct", expected_complexity="low", category="team", subcategory="name_validation", tags=["team", "validation", "invalid"], ), ] # ═══════════════════════════════════════════════════════════════════════════ # Consistency Benchmarks # ═══════════════════════════════════════════════════════════════════════════ CONSISTENCY_BENCHMARKS: list[BenchmarkCase] = [ BenchmarkCase( id="consist-direct-001", input="翻译'hello world'成中文", expected_skill="direct_agent", expected_execution_mode="direct", expected_complexity="low", category="consistency", subcategory="deterministic", tags=["consistency", "translation"], ), BenchmarkCase( id="consist-direct-002", input="什么是RAG?", expected_skill="direct_agent", expected_execution_mode="direct", expected_complexity="low", category="consistency", subcategory="deterministic", tags=["consistency", "qa"], ), BenchmarkCase( id="consist-react-001", input="搜索AI Agent市场数据", expected_skill="react_agent", expected_execution_mode="react", expected_complexity="high", category="consistency", subcategory="deterministic", tags=["consistency", "search"], ), BenchmarkCase( id="consist-geo-001", input="帮我优化这篇文章的SEO", expected_skill="geo_optimizer", expected_execution_mode="llm_generate", expected_complexity="low", category="consistency", subcategory="deterministic", tags=["consistency", "seo"], ), BenchmarkCase( id="consist-deai-001", input="帮我把这篇文章去AI化", expected_skill="deai_agent", expected_execution_mode="llm_generate", expected_complexity="low", category="consistency", subcategory="deterministic", tags=["consistency", "deai"], ), ] # ═══════════════════════════════════════════════════════════════════════════ # Semantic Router Benchmarks # ═══════════════════════════════════════════════════════════════════════════ SEMANTIC_ROUTER_BENCHMARKS: list[BenchmarkCase] = [ BenchmarkCase( id="semantic-direct-001", input="简单生成任务,无需工具调用", expected_skill="direct_agent", expected_execution_mode="direct", expected_complexity="low", category="semantic_router", subcategory="description_match", paraphrases=["只需要一次生成的简单任务", "Single LLM call task"], tags=["semantic", "direct"], ), BenchmarkCase( id="semantic-react-001", input="需要动态适应、逐步推理和工具调用", expected_skill="react_agent", expected_execution_mode="react", expected_complexity="high", category="semantic_router", subcategory="description_match", paraphrases=["需要多步推理和工具", "Multi-step reasoning with tools"], tags=["semantic", "react"], ), BenchmarkCase( id="semantic-rewoo-001", input="多源数据并行采集、无依赖工具调用批量执行", expected_skill="rewoo_agent", expected_execution_mode="rewoo", expected_complexity="high", category="semantic_router", subcategory="description_match", paraphrases=["并行批量获取数据", "Parallel data collection"], tags=["semantic", "rewoo"], ), BenchmarkCase( id="semantic-reflex-001", input="需要高精度和自我验证的任务", expected_skill="reflexion_agent", expected_execution_mode="reflexion", expected_complexity="high", category="semantic_router", subcategory="description_match", paraphrases=["需要自我检查的高精度任务", "High-precision self-verification task"], tags=["semantic", "reflexion"], ), BenchmarkCase( id="semantic-planexec-001", input="结构化多步骤任务,需要可审查的规划和执行", expected_skill="plan_exec_agent", expected_execution_mode="plan_exec", expected_complexity="high", category="semantic_router", subcategory="description_match", paraphrases=["需要先规划再执行的任务", "Structured planning and execution"], tags=["semantic", "plan_exec"], ), BenchmarkCase( id="semantic-geo-001", input="对文章进行GEO/SEO优化,提升在AI搜索引擎中的可见性", expected_skill="geo_optimizer", expected_execution_mode="llm_generate", expected_complexity="low", category="semantic_router", subcategory="description_match", paraphrases=["提升内容搜索排名", "Improve content visibility in AI search"], tags=["semantic", "geo"], ), BenchmarkCase( id="semantic-citation-001", input="检测品牌在各AI平台回答中的引用情况", expected_skill="citation_detector", expected_execution_mode="custom", expected_complexity="medium", category="semantic_router", subcategory="description_match", paraphrases=["分析品牌被AI引用的情况", "Check brand citation across AI platforms"], tags=["semantic", "citation"], ), BenchmarkCase( id="semantic-competitor-001", input="分析竞品策略、对比品牌差距或发现竞争机会", expected_skill="competitor_analyzer", expected_execution_mode="tool_call", expected_complexity="medium", category="semantic_router", subcategory="description_match", paraphrases=["竞品对比和差距分析", "Competitive gap analysis"], tags=["semantic", "competitor"], ), # --- Colloquial / casual expressions (口语化表达) --- BenchmarkCase( id="semantic-colloquial-review-001", input="帮我看看代码有没有问题", expected_skill="code_reviewer", expected_execution_mode="react", expected_complexity="medium", category="semantic_router", subcategory="colloquial_match", paraphrases=["代码审查一下", "Check my code for issues"], tags=["semantic", "colloquial", "code_review"], ), BenchmarkCase( id="semantic-colloquial-trend-001", input="最近市场行情怎么样", expected_skill="trend_agent", expected_execution_mode="tool_call", expected_complexity="medium", category="semantic_router", subcategory="colloquial_match", paraphrases=["市场走势如何", "What's the market trend"], tags=["semantic", "colloquial", "trend"], ), BenchmarkCase( id="semantic-colloquial-content-001", input="帮我写点东西", expected_skill="content_generator", expected_execution_mode="llm_generate", expected_complexity="low", category="semantic_router", subcategory="colloquial_match", paraphrases=["写篇文章吧", "Write something for me"], tags=["semantic", "colloquial", "content"], ), BenchmarkCase( id="semantic-colloquial-citation-001", input="这个引用对不对", expected_skill="citation_detector", expected_execution_mode="custom", expected_complexity="medium", category="semantic_router", subcategory="colloquial_match", paraphrases=["查查引用准不准", "Are these citations correct"], tags=["semantic", "colloquial", "citation"], ), BenchmarkCase( id="semantic-colloquial-competitor-001", input="对手怎么样", expected_skill="competitor_analyzer", expected_execution_mode="tool_call", expected_complexity="medium", category="semantic_router", subcategory="colloquial_match", paraphrases=["竞品啥情况", "How are competitors doing"], tags=["semantic", "colloquial", "competitor"], ), # --- Mixed Chinese-English expressions (中英混合) --- BenchmarkCase( id="semantic-mixed-review-001", input="review一下这段代码", expected_skill="code_reviewer", expected_execution_mode="react", expected_complexity="medium", category="semantic_router", subcategory="mixed_lang_match", paraphrases=["帮我review代码", "Code review please"], tags=["semantic", "mixed", "code_review"], ), BenchmarkCase( id="semantic-mixed-geo-001", input="做个SEO优化", expected_skill="geo_optimizer", expected_execution_mode="llm_generate", expected_complexity="low", category="semantic_router", subcategory="mixed_lang_match", paraphrases=["GEO优化一下", "Optimize for AI search"], tags=["semantic", "mixed", "geo"], ), BenchmarkCase( id="semantic-mixed-monitor-001", input="monitor一下系统状态", expected_skill="monitor", expected_execution_mode="tool_call", expected_complexity="medium", category="semantic_router", subcategory="mixed_lang_match", paraphrases=["监控系统运行", "Monitor system status"], tags=["semantic", "mixed", "monitor"], ), ] # ═══════════════════════════════════════════════════════════════════════════ # Alignment Guard Benchmarks # ═══════════════════════════════════════════════════════════════════════════ ALIGNMENT_BENCHMARKS: list[BenchmarkCase] = [ BenchmarkCase( id="align-negative-001", input="写一篇产品介绍,不要提及价格", expected_skill="content_generator", expected_execution_mode="llm_generate", expected_complexity="low", category="alignment", subcategory="negative_constraint", tags=["alignment", "negative_constraint"], ), BenchmarkCase( id="align-positive-001", input="生成报告,必须包含摘要部分", expected_skill="plan_exec_agent", expected_execution_mode="plan_exec", expected_complexity="high", category="alignment", subcategory="positive_constraint", tags=["alignment", "positive_constraint"], ), BenchmarkCase( id="align-cascade-001", input="反复搜索相同关键词", expected_skill="react_agent", expected_execution_mode="react", expected_complexity="high", category="alignment", subcategory="cascade_detection", tags=["alignment", "cascade"], ), BenchmarkCase( id="align-no-constraint-001", input="帮我写一篇文章", expected_skill="content_generator", expected_execution_mode="llm_generate", expected_complexity="low", category="alignment", subcategory="no_constraint", tags=["alignment", "baseline"], ), BenchmarkCase( id="align-combined-001", input="生成竞品分析报告,必须包含对比表格,不要提及内部数据", expected_skill="competitor_analyzer", expected_execution_mode="tool_call", expected_complexity="medium", category="alignment", subcategory="combined_constraint", tags=["alignment", "combined"], ), ] # ═══════════════════════════════════════════════════════════════════════════ # Board Meeting (Private Board) Benchmarks — @board prefix routing # ═══════════════════════════════════════════════════════════════════════════ BOARD_BENCHMARKS: list[BenchmarkCase] = [ # --- Default template (@board without experts) --- BenchmarkCase( id="board-default-001", input="@board 讨论是否应该进入东南亚市场", expected_execution_mode="board", expected_complexity="high", category="board", subcategory="default_template", paraphrases=[ "@board 我们要不要拓展东南亚业务", "@board 东南亚市场进入策略讨论", "@board:private_board 评估东南亚市场机会", ], tags=["board", "default", "strategy"], ), BenchmarkCase( id="board-default-002", input="@board AI产品定价策略应该怎么做", expected_execution_mode="board", expected_complexity="high", category="board", subcategory="default_template", paraphrases=["@board 如何给AI产品定价", "@board AI产品定价讨论"], tags=["board", "default", "pricing"], ), # --- Explicit expert list (@board:expert1,expert2) --- BenchmarkCase( id="board-explicit-001", input="@board:elon_musk,jeff_bezos 讨论火星殖民的商业化路径", expected_execution_mode="board", expected_complexity="high", category="board", subcategory="explicit_experts", paraphrases=[ "@board:elon_musk,jeff_bezos 火星商业化方案", "@board:jeff_bezos,elon_musk 如何商业化火星", ], tags=["board", "explicit", "mars"], ), BenchmarkCase( id="board-explicit-002", input="@board:charlie_munger,warren_buffett 价值投资在AI时代的适用性", expected_execution_mode="board", expected_complexity="high", category="board", subcategory="explicit_experts", paraphrases=[ "@board:charlie_munger,warren_buffett AI时代还要不要价值投资", ], tags=["board", "explicit", "investing"], ), # --- Explicit default template name --- BenchmarkCase( id="board-template-001", input="@board:private_board 讨论创业公司融资节奏", expected_execution_mode="board", expected_complexity="high", category="board", subcategory="explicit_template", paraphrases=["@board:private_board 创业融资策略", "@board:private_board 融资节奏讨论"], tags=["board", "template", "fundraising"], ), # --- Edge cases --- BenchmarkCase( id="board-edge-empty-topic-001", input="@board", expected_execution_mode="board", expected_complexity="low", category="board", subcategory="empty_topic", tags=["board", "edge", "empty"], ), BenchmarkCase( id="board-edge-no-prefix-001", input="讨论一下市场策略", expected_execution_mode="react", expected_complexity="medium", category="board", subcategory="no_prefix", paraphrases=["分析市场策略", "市场策略讨论"], tags=["board", "edge", "no_match"], ), # --- Name validation --- BenchmarkCase( id="board-name-valid-001", input="@board:elon_musk,jeff_bezos,allenzhang 产品设计哲学", expected_execution_mode="board", expected_complexity="high", category="board", subcategory="name_validation", tags=["board", "validation", "valid"], ), BenchmarkCase( id="board-name-invalid-001", input="@board:@#$ 讨论主题", expected_execution_mode="board", expected_complexity="low", category="board", subcategory="name_validation", tags=["board", "validation", "invalid"], ), # --- Stop command (user intervention) --- BenchmarkCase( id="board-stop-001", input="/stop", expected_execution_mode="board", expected_complexity="low", category="board", subcategory="stop_command", paraphrases=["停止讨论", "结束讨论"], tags=["board", "stop", "intervention"], ), ] # ═══════════════════════════════════════════════════════════════════════════ # All benchmarks combined # ═══════════════════════════════════════════════════════════════════════════ ALL_BENCHMARKS: list[BenchmarkCase] = ( ROUTING_KEYWORD_BENCHMARKS + ROUTING_EDGE_BENCHMARKS + EXECUTION_BENCHMARKS + TEAM_BENCHMARKS + CONSISTENCY_BENCHMARKS + SEMANTIC_ROUTER_BENCHMARKS + ALIGNMENT_BENCHMARKS + BOARD_BENCHMARKS ) def get_benchmarks_by_category(category: str) -> list[BenchmarkCase]: """Filter benchmarks by category.""" return [b for b in ALL_BENCHMARKS if b.category == category] def get_benchmarks_by_subcategory(subcategory: str) -> list[BenchmarkCase]: """Filter benchmarks by subcategory.""" return [b for b in ALL_BENCHMARKS if b.subcategory == subcategory] def get_benchmarks_with_paraphrases() -> list[BenchmarkCase]: """Get only benchmarks that have paraphrases (for overfitting detection).""" return [b for b in ALL_BENCHMARKS if b.paraphrases] def get_skill_names_needed() -> set[str]: """Get all skill names referenced in benchmarks (for pre-registration).""" return {b.expected_skill for b in ALL_BENCHMARKS if b.expected_skill is not None} def get_benchmark_stats() -> dict[str, int]: """Get benchmark count by category.""" stats: dict[str, int] = {} for b in ALL_BENCHMARKS: stats[b.category] = stats.get(b.category, 0) + 1 stats["total"] = len(ALL_BENCHMARKS) return stats