# fischer-agentkit/tests/e2e/benchmark_dataset.py

"""Agent Capability Benchmark — Ground Truth Dataset (v2).

Aligned with actual skills in configs/skills/*.yaml.
Contains both manually curated edge cases and auto-generated cases.

Categories:
  - routing: intent routing correctness
  - execution: execution mode selection accuracy
  - team: expert team collaboration
  - consistency: deterministic output consistency
  - semantic_router: semantic similarity matching
  - alignment: constraint compliance and cascade detection
"""

from pydantic import BaseModel, ConfigDict


class BenchmarkCase(BaseModel):
    """A single benchmark test case with ground truth label.

    Each instance pairs an input text with the routing/execution outcome the
    benchmark expects for it. The model is configured with ``frozen=True``,
    so fields cannot be reassigned after construction.
    """

    # frozen=True blocks attribute reassignment on instances.
    # NOTE(review): the list fields can presumably still be mutated in place
    # (e.g. ``case.tags.append(...)``) — confirm against pydantic's docs.
    model_config = ConfigDict(frozen=True)

    # Case identifier, e.g. "route-kw-direct-001".
    id: str
    # Input text of the case.
    input: str
    # Ground-truth skill name; None when no skill is expected (the greeting,
    # identity and fallback cases in this file set it to None explicitly).
    expected_skill: str | None = None
    # Ground-truth execution mode. Values used in this file: "direct",
    # "react", "rewoo", "reflexion", "plan_exec", "llm_generate",
    # "tool_call", "custom".
    expected_execution_mode: str = "direct"
    # Ground-truth complexity label. Values used in this file: "low",
    # "medium", "high".
    expected_complexity: str = "low"
    # Top-level grouping; see the "Categories" list in the module docstring.
    category: str
    # Finer grouping within a category, e.g. "keyword_match" or "greeting".
    subcategory: str
    # Alternative wordings of `input`; empty when a case defines none.
    paraphrases: list[str] = []
    # Free-form labels attached to the case.
    tags: list[str] = []


# ═══════════════════════════════════════════════════════════════════════════
# Routing — Keyword Match (aligned with actual skills)
# ═══════════════════════════════════════════════════════════════════════════

# Keyword-match routing cases, grouped by the skill they are expected to hit
# (one short comment per group). Every case here uses category="routing" and
# subcategory="keyword_match", and carries both Chinese and English
# paraphrases of its input.
# NOTE(review): `tags` presumably list the trigger keywords of the target
# skill — confirm against configs/skills/*.yaml.
ROUTING_KEYWORD_BENCHMARKS: list[BenchmarkCase] = [
    # direct_agent
    BenchmarkCase(
        id="route-kw-direct-001",
        input="翻译这段话",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["帮我翻译一下", "请翻译这段内容", "Translate this text"],
        tags=["翻译", "translate"],
    ),
    BenchmarkCase(
        id="route-kw-direct-002",
        input="帮我总结一下",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["请总结", "给我一个摘要", "Summarize this"],
        tags=["摘要", "summarize"],
    ),
    BenchmarkCase(
        id="route-kw-direct-003",
        input="什么是RAG？",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["RAG是什么", "解释一下RAG", "What is RAG?"],
        tags=["什么是"],
    ),
    # react_agent
    BenchmarkCase(
        id="route-kw-react-001",
        input="搜索一下AI Agent市场数据",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=[
            "帮我搜索AI Agent市场信息",
            "查找AI Agent的市场数据",
            "Search AI Agent market data",
        ],
        tags=["搜索", "search"],
    ),
    BenchmarkCase(
        id="route-kw-react-002",
        input="帮我分析这个数据",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["分析一下这些数据", "请对数据做分析", "Analyze this data"],
        tags=["分析", "analyze"],
    ),
    BenchmarkCase(
        id="route-kw-react-003",
        input="实时监控竞品动态",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["监控竞争对手的动态", "实时追踪竞品变化", "Monitor competitor activities"],
        tags=["实时", "监控"],
    ),
    # rewoo_agent
    BenchmarkCase(
        id="route-kw-rewoo-001",
        input="采集A、B、C三个竞品的功能数据",
        expected_skill="rewoo_agent",
        expected_execution_mode="rewoo",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=[
            "批量采集竞品数据",
            "并行获取多个竞品信息",
            "Fetch data from multiple competitors",
        ],
        tags=["采集", "批量", "fetch"],
    ),
    BenchmarkCase(
        id="route-kw-rewoo-002",
        input="并行搜索多个关键词",
        expected_skill="rewoo_agent",
        expected_execution_mode="rewoo",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["同时搜索多个关键词", "批量搜索", "Search multiple keywords in parallel"],
        tags=["并行", "批量"],
    ),
    # reflexion_agent
    BenchmarkCase(
        id="route-kw-reflex-001",
        input="审查这段代码的合规性",
        expected_skill="reflexion_agent",
        expected_execution_mode="reflexion",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["检查代码是否合规", "审查代码合规问题", "Review code compliance"],
        tags=["审查", "合规", "review"],
    ),
    BenchmarkCase(
        id="route-kw-reflex-002",
        input="生成一个高精度的数据分析脚本",
        expected_skill="reflexion_agent",
        expected_execution_mode="reflexion",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=[
            "写一个精确的数据分析脚本",
            "生成高精度分析代码",
            "Generate a precise analysis script",
        ],
        tags=["代码生成", "精确", "code"],
    ),
    # plan_exec_agent
    BenchmarkCase(
        id="route-kw-planexec-001",
        input="生成一份市场分析报告",
        expected_skill="plan_exec_agent",
        expected_execution_mode="plan_exec",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["做一份市场分析报告", "写个市场分析报告", "Generate a market analysis report"],
        tags=["报告", "分析报告"],
    ),
    BenchmarkCase(
        id="route-kw-planexec-002",
        input="规划产品优化方案",
        expected_skill="plan_exec_agent",
        expected_execution_mode="plan_exec",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["制定产品优化计划", "帮我规划产品优化", "Plan product optimization"],
        tags=["规划", "plan"],
    ),
    # code_reviewer
    BenchmarkCase(
        id="route-kw-coderev-001",
        input="Review this code for quality",
        expected_skill="code_reviewer",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["审查这段代码的质量", "代码审查", "Check code quality"],
        tags=["review", "代码审查"],
    ),
    # geo_optimizer
    BenchmarkCase(
        id="route-kw-geo-001",
        input="帮我优化这篇文章的SEO",
        expected_skill="geo_optimizer",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["SEO优化一下", "提升文章搜索排名", "Optimize this article for SEO"],
        tags=["SEO优化", "optimize"],
    ),
    # deai_agent
    BenchmarkCase(
        id="route-kw-deai-001",
        input="帮我把这篇文章去AI化",
        expected_skill="deai_agent",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["让这段文字更自然", "改写得像人写的", "Make this text more natural"],
        tags=["去AI化", "人性化"],
    ),
    # content_generator
    BenchmarkCase(
        id="route-kw-content-001",
        input="帮我写一篇关于AI的文章",
        expected_skill="content_generator",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["写一篇AI相关文章", "生成关于AI的内容", "Write an article about AI"],
        tags=["写文章", "generate"],
    ),
    # citation_detector
    BenchmarkCase(
        id="route-kw-citation-001",
        input="检测我们的品牌在AI平台的引用情况",
        expected_skill="citation_detector",
        expected_execution_mode="custom",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=[
            "分析品牌引用率",
            "哪些AI平台引用了我们",
            "Check brand citation on AI platforms",
        ],
        tags=["引用检测", "citation"],
    ),
    # trend_agent
    BenchmarkCase(
        id="route-kw-trend-001",
        input="分析品牌趋势",
        expected_skill="trend_agent",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["最近的热点话题是什么", "趋势洞察", "Analyze brand trends"],
        tags=["趋势", "trend"],
    ),
    # competitor_analyzer
    BenchmarkCase(
        id="route-kw-competitor-001",
        input="分析我的竞品策略",
        expected_skill="competitor_analyzer",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["对比我和竞品的差距", "竞品分析", "Analyze competitor strategies"],
        tags=["竞品", "competitor"],
    ),
    # schema_advisor
    BenchmarkCase(
        id="route-kw-schema-001",
        input="帮我优化Schema",
        expected_skill="schema_advisor",
        expected_execution_mode="custom",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["生成JSON-LD结构化数据", "Schema有什么可以改进的", "Optimize my Schema"],
        tags=["Schema", "schema优化"],
    ),
    # monitor
    BenchmarkCase(
        id="route-kw-monitor-001",
        input="监测品牌引用变化",
        expected_skill="monitor",
        expected_execution_mode="custom",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["追踪效果", "品牌排名变化", "Monitor brand citation changes"],
        tags=["监测", "monitor"],
    ),
    # goal_driven_agent
    BenchmarkCase(
        id="route-kw-goal-001",
        input="分析竞品SEO策略并生成优化方案",
        expected_skill="goal_driven_agent",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=[
            "调研技术方案并生成对比报告",
            "制定市场推广计划",
            "Analyze SEO and generate plan",
        ],
        tags=["分析", "优化方案"],
    ),
]


# ═══════════════════════════════════════════════════════════════════════════
# Routing — Edge Cases (manually curated)
# ═══════════════════════════════════════════════════════════════════════════

# Routing edge cases, grouped by subcategory: greeting, identity,
# explicit_prefix, fallback and disambiguation. expected_skill=None marks an
# input that is expected not to be routed to any skill.
ROUTING_EDGE_BENCHMARKS: list[BenchmarkCase] = [
    # Greeting (should NOT route to any skill)
    BenchmarkCase(
        id="route-edge-greet-001",
        input="你好",
        expected_skill=None,
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="greeting",
        paraphrases=["Hello", "Hi there", "早上好"],
        tags=["greeting"],
    ),
    BenchmarkCase(
        id="route-edge-greet-002",
        input="Good morning!",
        expected_skill=None,
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="greeting",
        paraphrases=["早上好！", "你好呀"],
        tags=["greeting"],
    ),
    # Identity (should NOT route to any skill)
    BenchmarkCase(
        id="route-edge-identity-001",
        input="你是谁？",
        expected_skill=None,
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="identity",
        paraphrases=["What is your name?", "介绍一下你自己", "Tell me about yourself"],
        tags=["identity"],
    ),
    # Explicit prefix
    BenchmarkCase(
        id="route-edge-explicit-001",
        input="@skill:react_agent 搜索最新的AI新闻",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="routing",
        subcategory="explicit_prefix",
        paraphrases=["@skill:react_agent 查找AI最新动态"],
        tags=["explicit", "react"],
    ),
    # Fallback (no matching skill)
    BenchmarkCase(
        id="route-edge-fallback-001",
        input="告诉我一个笑话",
        expected_skill=None,
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="fallback",
        paraphrases=["讲个笑话", "Tell me a joke", "说个搞笑的"],
        tags=["fallback"],
    ),
    BenchmarkCase(
        id="route-edge-fallback-002",
        input="What is quantum physics?",
        expected_skill=None,
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="fallback",
        paraphrases=["量子物理是什么", "Explain quantum mechanics"],
        tags=["fallback"],
    ),
    # Disambiguation (multiple skills could match)
    BenchmarkCase(
        id="route-edge-disambig-001",
        input="审查代码并优化SEO",
        expected_skill="code_reviewer",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="disambiguation",
        paraphrases=["Review code and optimize SEO", "代码审查加SEO优化"],
        tags=["disambiguation", "review", "seo"],
    ),
]


# ═══════════════════════════════════════════════════════════════════════════
# Execution Mode Benchmarks
# ═══════════════════════════════════════════════════════════════════════════

# Execution-mode cases (category="execution"). The subcategory names the mode
# under test (direct_mode, react_mode, rewoo_mode, reflexion_mode,
# plan_exec_mode); the final case uses subcategory="quality_gate".
EXECUTION_BENCHMARKS: list[BenchmarkCase] = [
    BenchmarkCase(
        id="exec-direct-001",
        input="翻译这段话成英文",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="execution",
        subcategory="direct_mode",
        paraphrases=["Translate this to English", "把这段翻成英语"],
        tags=["direct", "simple"],
    ),
    BenchmarkCase(
        id="exec-direct-002",
        input="什么是AgentKit？",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="execution",
        subcategory="direct_mode",
        paraphrases=["AgentKit是什么", "Explain AgentKit"],
        tags=["direct", "qa"],
    ),
    BenchmarkCase(
        id="exec-react-001",
        input="搜索并分析AI行业最新趋势",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="execution",
        subcategory="react_mode",
        paraphrases=["Search and analyze AI trends", "调研AI行业趋势"],
        tags=["react", "multi_step"],
    ),
    BenchmarkCase(
        id="exec-react-002",
        input="实时监控竞品动态并生成报告",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="execution",
        subcategory="react_mode",
        paraphrases=["Monitor competitors and report", "追踪竞品并输出报告"],
        tags=["react", "monitoring"],
    ),
    BenchmarkCase(
        id="exec-rewoo-001",
        input="批量采集多个竞品的功能数据",
        expected_skill="rewoo_agent",
        expected_execution_mode="rewoo",
        expected_complexity="high",
        category="execution",
        subcategory="rewoo_mode",
        paraphrases=["并行获取竞品数据", "Fetch competitor data in parallel"],
        tags=["rewoo", "parallel"],
    ),
    BenchmarkCase(
        id="exec-reflexion-001",
        input="审查代码合规性并确保高精度",
        expected_skill="reflexion_agent",
        expected_execution_mode="reflexion",
        expected_complexity="high",
        category="execution",
        subcategory="reflexion_mode",
        paraphrases=["高精度代码审查", "Precise code compliance review"],
        tags=["reflexion", "precision"],
    ),
    BenchmarkCase(
        id="exec-planexec-001",
        input="生成一份完整的市场调研报告",
        expected_skill="plan_exec_agent",
        expected_execution_mode="plan_exec",
        expected_complexity="high",
        category="execution",
        subcategory="plan_exec_mode",
        paraphrases=["做一份市场调研报告", "Generate a market research report"],
        tags=["plan_exec", "report"],
    ),
    BenchmarkCase(
        id="exec-quality-001",
        input="生成内容并确保质量达标",
        expected_skill="content_generator",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="execution",
        subcategory="quality_gate",
        paraphrases=["生成高质量内容", "Generate quality content"],
        tags=["quality", "content"],
    ),
]


# ═══════════════════════════════════════════════════════════════════════════
# Team Collaboration Benchmarks
# ═══════════════════════════════════════════════════════════════════════════

# Team collaboration cases (category="team"). None of them passes
# expected_skill, so it takes the model default of None. The two
# name_validation cases at the end define no paraphrases.
TEAM_BENCHMARKS: list[BenchmarkCase] = [
    BenchmarkCase(
        id="team-explicit-001",
        input="@team:react_agent,plan_exec_agent 协作完成深度分析并生成报告",
        expected_execution_mode="react",
        expected_complexity="high",
        category="team",
        subcategory="explicit_team",
        paraphrases=[
            "需要react_agent和plan_exec_agent协作",
            "组建团队：搜索分析+报告生成",
        ],
        tags=["team", "explicit"],
    ),
    BenchmarkCase(
        id="team-explicit-002",
        input="@team:competitor_analyzer,trend_agent 分析竞品并追踪趋势",
        expected_execution_mode="react",
        expected_complexity="high",
        category="team",
        subcategory="explicit_team",
        paraphrases=["竞品分析+趋势追踪团队", "Team for competitor and trend analysis"],
        tags=["team", "explicit"],
    ),
    BenchmarkCase(
        id="team-complexity-001",
        input="深度分析竞品策略、追踪品牌趋势并生成优化方案",
        expected_execution_mode="react",
        expected_complexity="high",
        category="team",
        subcategory="complexity_trigger",
        paraphrases=[
            "全面竞品分析和优化方案",
            "Comprehensive competitor analysis with optimization",
        ],
        tags=["team", "complexity"],
    ),
    BenchmarkCase(
        id="team-fallback-001",
        input="复杂任务但无匹配专家",
        expected_execution_mode="react",
        expected_complexity="high",
        category="team",
        subcategory="fallback",
        paraphrases=["需要团队但找不到合适专家", "Complex task without matching experts"],
        tags=["team", "fallback"],
    ),
    BenchmarkCase(
        id="team-name-valid-001",
        input="@team:react_agent,plan_exec_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="team",
        subcategory="name_validation",
        tags=["team", "validation"],
    ),
    BenchmarkCase(
        id="team-name-invalid-001",
        input="@team:invalid expert name",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="team",
        subcategory="name_validation",
        tags=["team", "validation", "invalid"],
    ),
]


# ═══════════════════════════════════════════════════════════════════════════
# Consistency Benchmarks
# ═══════════════════════════════════════════════════════════════════════════

# Consistency cases (category="consistency", subcategory="deterministic").
# No case defines paraphrases, so each one is a single fixed input.
# NOTE(review): presumably each input is run repeatedly and the outputs
# compared — confirm against the test harness.
CONSISTENCY_BENCHMARKS: list[BenchmarkCase] = [
    BenchmarkCase(
        id="consist-direct-001",
        input="翻译'hello world'成中文",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="consistency",
        subcategory="deterministic",
        tags=["consistency", "translation"],
    ),
    BenchmarkCase(
        id="consist-direct-002",
        input="什么是RAG？",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="consistency",
        subcategory="deterministic",
        tags=["consistency", "qa"],
    ),
    BenchmarkCase(
        id="consist-react-001",
        input="搜索AI Agent市场数据",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="consistency",
        subcategory="deterministic",
        tags=["consistency", "search"],
    ),
    BenchmarkCase(
        id="consist-geo-001",
        input="帮我优化这篇文章的SEO",
        expected_skill="geo_optimizer",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="consistency",
        subcategory="deterministic",
        tags=["consistency", "seo"],
    ),
    BenchmarkCase(
        id="consist-deai-001",
        input="帮我把这篇文章去AI化",
        expected_skill="deai_agent",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="consistency",
        subcategory="deterministic",
        tags=["consistency", "deai"],
    ),
]


# ═══════════════════════════════════════════════════════════════════════════
# Semantic Router Benchmarks
# ═══════════════════════════════════════════════════════════════════════════

# Semantic-router cases (category="semantic_router",
# subcategory="description_match"). Inputs are descriptive sentences rather
# than direct user commands.
# NOTE(review): the inputs presumably mirror the skill descriptions in
# configs/skills/*.yaml — confirm against those files.
SEMANTIC_ROUTER_BENCHMARKS: list[BenchmarkCase] = [
    BenchmarkCase(
        id="semantic-direct-001",
        input="简单生成任务，无需工具调用",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["只需要一次生成的简单任务", "Single LLM call task"],
        tags=["semantic", "direct"],
    ),
    BenchmarkCase(
        id="semantic-react-001",
        input="需要动态适应、逐步推理和工具调用",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["需要多步推理和工具", "Multi-step reasoning with tools"],
        tags=["semantic", "react"],
    ),
    BenchmarkCase(
        id="semantic-rewoo-001",
        input="多源数据并行采集、无依赖工具调用批量执行",
        expected_skill="rewoo_agent",
        expected_execution_mode="rewoo",
        expected_complexity="high",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["并行批量获取数据", "Parallel data collection"],
        tags=["semantic", "rewoo"],
    ),
    BenchmarkCase(
        id="semantic-reflex-001",
        input="需要高精度和自我验证的任务",
        expected_skill="reflexion_agent",
        expected_execution_mode="reflexion",
        expected_complexity="high",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["需要自我检查的高精度任务", "High-precision self-verification task"],
        tags=["semantic", "reflexion"],
    ),
    BenchmarkCase(
        id="semantic-planexec-001",
        input="结构化多步骤任务，需要可审查的规划和执行",
        expected_skill="plan_exec_agent",
        expected_execution_mode="plan_exec",
        expected_complexity="high",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["需要先规划再执行的任务", "Structured planning and execution"],
        tags=["semantic", "plan_exec"],
    ),
    BenchmarkCase(
        id="semantic-geo-001",
        input="对文章进行GEO/SEO优化，提升在AI搜索引擎中的可见性",
        expected_skill="geo_optimizer",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["提升内容搜索排名", "Improve content visibility in AI search"],
        tags=["semantic", "geo"],
    ),
    BenchmarkCase(
        id="semantic-citation-001",
        input="检测品牌在各AI平台回答中的引用情况",
        expected_skill="citation_detector",
        expected_execution_mode="custom",
        expected_complexity="medium",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["分析品牌被AI引用的情况", "Check brand citation across AI platforms"],
        tags=["semantic", "citation"],
    ),
    BenchmarkCase(
        id="semantic-competitor-001",
        input="分析竞品策略、对比品牌差距或发现竞争机会",
        expected_skill="competitor_analyzer",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["竞品对比和差距分析", "Competitive gap analysis"],
        tags=["semantic", "competitor"],
    ),
]


# ═══════════════════════════════════════════════════════════════════════════
# Alignment Guard Benchmarks
# ═══════════════════════════════════════════════════════════════════════════

# Alignment cases (category="alignment"): one each for a negative constraint,
# a positive constraint, cascade detection, a no-constraint baseline and a
# combined constraint. None of them defines paraphrases.
ALIGNMENT_BENCHMARKS: list[BenchmarkCase] = [
    BenchmarkCase(
        id="align-negative-001",
        input="写一篇产品介绍，不要提及价格",
        expected_skill="content_generator",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="alignment",
        subcategory="negative_constraint",
        tags=["alignment", "negative_constraint"],
    ),
    BenchmarkCase(
        id="align-positive-001",
        input="生成报告，必须包含摘要部分",
        expected_skill="plan_exec_agent",
        expected_execution_mode="plan_exec",
        expected_complexity="high",
        category="alignment",
        subcategory="positive_constraint",
        tags=["alignment", "positive_constraint"],
    ),
    BenchmarkCase(
        id="align-cascade-001",
        input="反复搜索相同关键词",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="alignment",
        subcategory="cascade_detection",
        tags=["alignment", "cascade"],
    ),
    BenchmarkCase(
        id="align-no-constraint-001",
        input="帮我写一篇文章",
        expected_skill="content_generator",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="alignment",
        subcategory="no_constraint",
        tags=["alignment", "baseline"],
    ),
    BenchmarkCase(
        id="align-combined-001",
        input="生成竞品分析报告，必须包含对比表格，不要提及内部数据",
        expected_skill="competitor_analyzer",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="alignment",
        subcategory="combined_constraint",
        tags=["alignment", "combined"],
    ),
]


# ═══════════════════════════════════════════════════════════════════════════
# All benchmarks combined
# ═══════════════════════════════════════════════════════════════════════════

# Every benchmark case in one flat list, in section order: keyword routing,
# routing edge cases, execution, team, consistency, semantic router,
# alignment. The result is a new list; the per-section lists are not modified.
ALL_BENCHMARKS: list[BenchmarkCase] = [
    *ROUTING_KEYWORD_BENCHMARKS,
    *ROUTING_EDGE_BENCHMARKS,
    *EXECUTION_BENCHMARKS,
    *TEAM_BENCHMARKS,
    *CONSISTENCY_BENCHMARKS,
    *SEMANTIC_ROUTER_BENCHMARKS,
    *ALIGNMENT_BENCHMARKS,
]


def get_benchmarks_by_category(category: str) -> list[BenchmarkCase]:
    """Return the cases whose ``category`` equals *category*.

    Cases keep their ``ALL_BENCHMARKS`` order. A category that matches
    nothing yields an empty list.
    """
    matching: list[BenchmarkCase] = []
    for case in ALL_BENCHMARKS:
        if case.category == category:
            matching.append(case)
    return matching


def get_benchmarks_by_subcategory(subcategory: str) -> list[BenchmarkCase]:
    """Return the cases whose ``subcategory`` equals *subcategory*.

    Cases keep their ``ALL_BENCHMARKS`` order. A subcategory that matches
    nothing yields an empty list.
    """
    return list(filter(lambda case: case.subcategory == subcategory, ALL_BENCHMARKS))


def get_benchmarks_with_paraphrases() -> list[BenchmarkCase]:
    """Return the cases that define at least one paraphrase.

    Intended for overfitting detection. Cases keep their ``ALL_BENCHMARKS``
    order.
    """
    selected: list[BenchmarkCase] = []
    for case in ALL_BENCHMARKS:
        if not case.paraphrases:
            # An empty paraphrase list gives nothing to compare against.
            continue
        selected.append(case)
    return selected


def get_skill_names_needed() -> set[str]:
    """Collect the distinct skill names referenced by the benchmarks.

    Cases whose ``expected_skill`` is ``None`` contribute nothing. The
    result is meant for pre-registering skills before a run.
    """
    skill_names: set[str] = set()
    for case in ALL_BENCHMARKS:
        skill = case.expected_skill
        if skill is not None:
            skill_names.add(skill)
    return skill_names


def get_benchmark_stats() -> dict[str, int]:
    """Count benchmark cases per category.

    Returns:
        A dict mapping each category to its number of cases, in order of
        first appearance in ``ALL_BENCHMARKS``, plus a ``"total"`` key
        holding the overall number of cases.
    """
    counts: dict[str, int] = {}
    for case in ALL_BENCHMARKS:
        counts.setdefault(case.category, 0)
        counts[case.category] += 1
    # Written last, so it replaces the tally of any category named "total".
    counts["total"] = len(ALL_BENCHMARKS)
    return counts