"""Agent Capability Benchmark — Ground Truth Dataset (v2).

Aligned with actual skills in configs/skills/*.yaml.
Contains both manually curated edge cases and auto-generated cases.

Categories:
- routing: intent routing correctness
- execution: execution mode selection accuracy
- team: expert team collaboration
- consistency: deterministic output consistency
- semantic_router: semantic similarity matching
- alignment: constraint compliance and cascade detection
"""

from collections import Counter

from pydantic import BaseModel, ConfigDict


class BenchmarkCase(BaseModel):
    """One benchmark input paired with its ground-truth labels.

    Attributes:
        id: Case identifier, e.g. ``"route-kw-direct-001"``.
        input: The user utterance under test.
        expected_skill: Name of the skill the case expects, or ``None``
            (the default) when the case names no skill.
        expected_execution_mode: Expected execution mode; defaults to
            ``"direct"``.
        expected_complexity: Expected complexity label; defaults to ``"low"``.
        category: Benchmark category the case belongs to.
        subcategory: Finer-grained grouping inside the category.
        paraphrases: Alternative phrasings of ``input``; empty by default.
        tags: Free-form labels attached to the case; empty by default.
    """

    # frozen=True: attribute assignment on an instance is rejected.
    model_config = ConfigDict(frozen=True)

    id: str
    input: str
    expected_skill: str | None = None
    expected_execution_mode: str = "direct"
    expected_complexity: str = "low"
    category: str
    subcategory: str
    paraphrases: list[str] = []
    tags: list[str] = []


# ═══════════════════════════════════════════════════════════════════════════
# Routing — Keyword Match (aligned with actual skills)
# ═══════════════════════════════════════════════════════════════════════════

# Keyword-routing ground truth, grouped by the skill each case expects.
# Every case here uses category="routing" and subcategory="keyword_match".
ROUTING_KEYWORD_BENCHMARKS: list[BenchmarkCase] = [
    # --- direct_agent ---
    BenchmarkCase(
        id="route-kw-direct-001",
        input="翻译这段话",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["帮我翻译一下", "请翻译这段内容", "Translate this text"],
        tags=["翻译", "translate"],
    ),
    BenchmarkCase(
        id="route-kw-direct-002",
        input="帮我总结一下",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["请总结", "给我一个摘要", "Summarize this"],
        tags=["摘要", "summarize"],
    ),
    BenchmarkCase(
        id="route-kw-direct-003",
        input="什么是RAG?",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["RAG是什么", "解释一下RAG", "What is RAG?"],
        tags=["什么是"],
    ),
    # --- react_agent ---
    BenchmarkCase(
        id="route-kw-react-001",
        input="搜索一下AI Agent市场数据",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=[
            "帮我搜索AI Agent市场信息",
            "查找AI Agent的市场数据",
            "Search AI Agent market data",
        ],
        tags=["搜索", "search"],
    ),
    BenchmarkCase(
        id="route-kw-react-002",
        input="帮我分析这个数据",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["分析一下这些数据", "请对数据做分析", "Analyze this data"],
        tags=["分析", "analyze"],
    ),
    BenchmarkCase(
        id="route-kw-react-003",
        input="实时监控竞品动态",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["监控竞争对手的动态", "实时追踪竞品变化", "Monitor competitor activities"],
        tags=["实时", "监控"],
    ),
    # --- rewoo_agent ---
    BenchmarkCase(
        id="route-kw-rewoo-001",
        input="采集A、B、C三个竞品的功能数据",
        expected_skill="rewoo_agent",
        expected_execution_mode="rewoo",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=[
            "批量采集竞品数据",
            "并行获取多个竞品信息",
            "Fetch data from multiple competitors",
        ],
        tags=["采集", "批量", "fetch"],
    ),
    BenchmarkCase(
        id="route-kw-rewoo-002",
        input="并行搜索多个关键词",
        expected_skill="rewoo_agent",
        expected_execution_mode="rewoo",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["同时搜索多个关键词", "批量搜索", "Search multiple keywords in parallel"],
        tags=["并行", "批量"],
    ),
    # --- reflexion_agent ---
    BenchmarkCase(
        id="route-kw-reflex-001",
        input="审查这段代码的合规性",
        expected_skill="reflexion_agent",
        expected_execution_mode="reflexion",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["检查代码是否合规", "审查代码合规问题", "Review code compliance"],
        tags=["审查", "合规", "review"],
    ),
    BenchmarkCase(
        id="route-kw-reflex-002",
        input="生成一个高精度的数据分析脚本",
        expected_skill="reflexion_agent",
        expected_execution_mode="reflexion",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=[
            "写一个精确的数据分析脚本",
            "生成高精度分析代码",
            "Generate a precise analysis script",
        ],
        tags=["代码生成", "精确", "code"],
    ),
    # --- plan_exec_agent ---
    BenchmarkCase(
        id="route-kw-planexec-001",
        input="生成一份市场分析报告",
        expected_skill="plan_exec_agent",
        expected_execution_mode="plan_exec",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["做一份市场分析报告", "写个市场分析报告", "Generate a market analysis report"],
        tags=["报告", "分析报告"],
    ),
    BenchmarkCase(
        id="route-kw-planexec-002",
        input="规划产品优化方案",
        expected_skill="plan_exec_agent",
        expected_execution_mode="plan_exec",
        expected_complexity="high",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["制定产品优化计划", "帮我规划产品优化", "Plan product optimization"],
        tags=["规划", "plan"],
    ),
    # --- code_reviewer ---
    BenchmarkCase(
        id="route-kw-coderev-001",
        input="Review this code for quality",
        expected_skill="code_reviewer",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["审查这段代码的质量", "代码审查", "Check code quality"],
        tags=["review", "代码审查"],
    ),
    # --- geo_optimizer ---
    BenchmarkCase(
        id="route-kw-geo-001",
        input="帮我优化这篇文章的SEO",
        expected_skill="geo_optimizer",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["SEO优化一下", "提升文章搜索排名", "Optimize this article for SEO"],
        tags=["SEO优化", "optimize"],
    ),
    # --- deai_agent ---
    BenchmarkCase(
        id="route-kw-deai-001",
        input="帮我把这篇文章去AI化",
        expected_skill="deai_agent",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["让这段文字更自然", "改写得像人写的", "Make this text more natural"],
        tags=["去AI化", "人性化"],
    ),
    # --- content_generator ---
    BenchmarkCase(
        id="route-kw-content-001",
        input="帮我写一篇关于AI的文章",
        expected_skill="content_generator",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["写一篇AI相关文章", "生成关于AI的内容", "Write an article about AI"],
        tags=["写文章", "generate"],
    ),
    # --- citation_detector ---
    BenchmarkCase(
        id="route-kw-citation-001",
        input="检测我们的品牌在AI平台的引用情况",
        expected_skill="citation_detector",
        expected_execution_mode="custom",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=[
            "分析品牌引用率",
            "哪些AI平台引用了我们",
            "Check brand citation on AI platforms",
        ],
        tags=["引用检测", "citation"],
    ),
    # --- trend_agent ---
    # NOTE(review): the first paraphrase asks about hot topics rather than
    # brand trends — confirm it is meant as an equivalent phrasing.
    BenchmarkCase(
        id="route-kw-trend-001",
        input="分析品牌趋势",
        expected_skill="trend_agent",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["最近的热点话题是什么", "趋势洞察", "Analyze brand trends"],
        tags=["趋势", "trend"],
    ),
    # --- competitor_analyzer ---
    BenchmarkCase(
        id="route-kw-competitor-001",
        input="分析我的竞品策略",
        expected_skill="competitor_analyzer",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["对比我和竞品的差距", "竞品分析", "Analyze competitor strategies"],
        tags=["竞品", "competitor"],
    ),
    # --- schema_advisor ---
    BenchmarkCase(
        id="route-kw-schema-001",
        input="帮我优化Schema",
        expected_skill="schema_advisor",
        expected_execution_mode="custom",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["生成JSON-LD结构化数据", "Schema有什么可以改进的", "Optimize my Schema"],
        tags=["Schema", "schema优化"],
    ),
    # --- monitor ---
    BenchmarkCase(
        id="route-kw-monitor-001",
        input="监测品牌引用变化",
        expected_skill="monitor",
        expected_execution_mode="custom",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=["追踪效果", "品牌排名变化", "Monitor brand citation changes"],
        tags=["监测", "monitor"],
    ),
    # --- goal_driven_agent ---
    # NOTE(review): the first two paraphrases describe different tasks from
    # the main input — confirm they should share this case's labels.
    BenchmarkCase(
        id="route-kw-goal-001",
        input="分析竞品SEO策略并生成优化方案",
        expected_skill="goal_driven_agent",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="routing",
        subcategory="keyword_match",
        paraphrases=[
            "调研技术方案并生成对比报告",
            "制定市场推广计划",
            "Analyze SEO and generate plan",
        ],
        tags=["分析", "优化方案"],
    ),
]


# ═══════════════════════════════════════════════════════════════════════════
# Routing — Edge Cases (manually curated)
# ═══════════════════════════════════════════════════════════════════════════

# Routing edge cases; all use category="routing" with a per-group subcategory.
ROUTING_EDGE_BENCHMARKS: list[BenchmarkCase] = [
    # --- Greetings: should NOT route to any skill (expected_skill=None) ---
    BenchmarkCase(
        id="route-edge-greet-001",
        input="你好",
        expected_skill=None,
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="greeting",
        paraphrases=["Hello", "Hi there", "早上好"],
        tags=["greeting"],
    ),
    BenchmarkCase(
        id="route-edge-greet-002",
        input="Good morning!",
        expected_skill=None,
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="greeting",
        paraphrases=["早上好!", "你好呀"],
        tags=["greeting"],
    ),
    # --- Identity questions: should NOT route to any skill ---
    BenchmarkCase(
        id="route-edge-identity-001",
        input="你是谁?",
        expected_skill=None,
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="identity",
        paraphrases=["What is your name?", "介绍一下你自己", "Tell me about yourself"],
        tags=["identity"],
    ),
    # --- Explicit "@skill:" prefix naming the skill in the input ---
    BenchmarkCase(
        id="route-edge-explicit-001",
        input="@skill:react_agent 搜索最新的AI新闻",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="routing",
        subcategory="explicit_prefix",
        paraphrases=["@skill:react_agent 查找AI最新动态"],
        tags=["explicit", "react"],
    ),
    # --- Fallback: no matching skill ---
    BenchmarkCase(
        id="route-edge-fallback-001",
        input="告诉我一个笑话",
        expected_skill=None,
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="fallback",
        paraphrases=["讲个笑话", "Tell me a joke", "说个搞笑的"],
        tags=["fallback"],
    ),
    # NOTE(review): "What is …?" phrasing is labelled direct_agent in
    # route-kw-direct-003 but None here — confirm this is intended.
    BenchmarkCase(
        id="route-edge-fallback-002",
        input="What is quantum physics?",
        expected_skill=None,
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="fallback",
        paraphrases=["量子物理是什么", "Explain quantum mechanics"],
        tags=["fallback"],
    ),
    # --- Disambiguation: multiple skills could match; code_reviewer expected ---
    BenchmarkCase(
        id="route-edge-disambig-001",
        input="审查代码并优化SEO",
        expected_skill="code_reviewer",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="routing",
        subcategory="disambiguation",
        paraphrases=["Review code and optimize SEO", "代码审查加SEO优化"],
        tags=["disambiguation", "review", "seo"],
    ),
]


# ═══════════════════════════════════════════════════════════════════════════
# Execution Mode Benchmarks
# ═══════════════════════════════════════════════════════════════════════════

# Execution-mode ground truth; all use category="execution", one subcategory
# per expected mode plus a single quality_gate case.
EXECUTION_BENCHMARKS: list[BenchmarkCase] = [
    # --- direct_mode ---
    BenchmarkCase(
        id="exec-direct-001",
        input="翻译这段话成英文",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="execution",
        subcategory="direct_mode",
        paraphrases=["Translate this to English", "把这段翻成英语"],
        tags=["direct", "simple"],
    ),
    BenchmarkCase(
        id="exec-direct-002",
        input="什么是AgentKit?",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="execution",
        subcategory="direct_mode",
        paraphrases=["AgentKit是什么", "Explain AgentKit"],
        tags=["direct", "qa"],
    ),
    # --- react_mode ---
    BenchmarkCase(
        id="exec-react-001",
        input="搜索并分析AI行业最新趋势",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="execution",
        subcategory="react_mode",
        paraphrases=["Search and analyze AI trends", "调研AI行业趋势"],
        tags=["react", "multi_step"],
    ),
    BenchmarkCase(
        id="exec-react-002",
        input="实时监控竞品动态并生成报告",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="execution",
        subcategory="react_mode",
        paraphrases=["Monitor competitors and report", "追踪竞品并输出报告"],
        tags=["react", "monitoring"],
    ),
    # --- rewoo_mode ---
    BenchmarkCase(
        id="exec-rewoo-001",
        input="批量采集多个竞品的功能数据",
        expected_skill="rewoo_agent",
        expected_execution_mode="rewoo",
        expected_complexity="high",
        category="execution",
        subcategory="rewoo_mode",
        paraphrases=["并行获取竞品数据", "Fetch competitor data in parallel"],
        tags=["rewoo", "parallel"],
    ),
    # --- reflexion_mode ---
    BenchmarkCase(
        id="exec-reflexion-001",
        input="审查代码合规性并确保高精度",
        expected_skill="reflexion_agent",
        expected_execution_mode="reflexion",
        expected_complexity="high",
        category="execution",
        subcategory="reflexion_mode",
        paraphrases=["高精度代码审查", "Precise code compliance review"],
        tags=["reflexion", "precision"],
    ),
    # --- plan_exec_mode ---
    BenchmarkCase(
        id="exec-planexec-001",
        input="生成一份完整的市场调研报告",
        expected_skill="plan_exec_agent",
        expected_execution_mode="plan_exec",
        expected_complexity="high",
        category="execution",
        subcategory="plan_exec_mode",
        paraphrases=["做一份市场调研报告", "Generate a market research report"],
        tags=["plan_exec", "report"],
    ),
    # --- quality_gate ---
    BenchmarkCase(
        id="exec-quality-001",
        input="生成内容并确保质量达标",
        expected_skill="content_generator",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="execution",
        subcategory="quality_gate",
        paraphrases=["生成高质量内容", "Generate quality content"],
        tags=["quality", "content"],
    ),
]


# ═══════════════════════════════════════════════════════════════════════════
# Team Collaboration Benchmarks
# ═══════════════════════════════════════════════════════════════════════════

# Team-collaboration ground truth; all use category="team". None of these
# cases sets expected_skill, so it keeps its default of None.
TEAM_BENCHMARKS: list[BenchmarkCase] = [
    # --- explicit_team: "@team:" prefix listing the members ---
    BenchmarkCase(
        id="team-explicit-001",
        input="@team:react_agent,plan_exec_agent 协作完成深度分析并生成报告",
        expected_execution_mode="react",
        expected_complexity="high",
        category="team",
        subcategory="explicit_team",
        paraphrases=[
            "需要react_agent和plan_exec_agent协作",
            "组建团队:搜索分析+报告生成",
        ],
        tags=["team", "explicit"],
    ),
    BenchmarkCase(
        id="team-explicit-002",
        input="@team:competitor_analyzer,trend_agent 分析竞品并追踪趋势",
        expected_execution_mode="react",
        expected_complexity="high",
        category="team",
        subcategory="explicit_team",
        paraphrases=["竞品分析+趋势追踪团队", "Team for competitor and trend analysis"],
        tags=["team", "explicit"],
    ),
    # --- complexity_trigger ---
    BenchmarkCase(
        id="team-complexity-001",
        input="深度分析竞品策略、追踪品牌趋势并生成优化方案",
        expected_execution_mode="react",
        expected_complexity="high",
        category="team",
        subcategory="complexity_trigger",
        paraphrases=[
            "全面竞品分析和优化方案",
            "Comprehensive competitor analysis with optimization",
        ],
        tags=["team", "complexity"],
    ),
    # --- fallback ---
    BenchmarkCase(
        id="team-fallback-001",
        input="复杂任务但无匹配专家",
        expected_execution_mode="react",
        expected_complexity="high",
        category="team",
        subcategory="fallback",
        paraphrases=["需要团队但找不到合适专家", "Complex task without matching experts"],
        tags=["team", "fallback"],
    ),
    # --- name_validation: one valid and one invalid "@team:" member list ---
    BenchmarkCase(
        id="team-name-valid-001",
        input="@team:react_agent,plan_exec_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="team",
        subcategory="name_validation",
        tags=["team", "validation"],
    ),
    BenchmarkCase(
        id="team-name-invalid-001",
        input="@team:invalid expert name",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="team",
        subcategory="name_validation",
        tags=["team", "validation", "invalid"],
    ),
]


# ═══════════════════════════════════════════════════════════════════════════
# Consistency Benchmarks
# ═══════════════════════════════════════════════════════════════════════════

# Consistency ground truth; all use category="consistency" and
# subcategory="deterministic". No case here defines paraphrases.
CONSISTENCY_BENCHMARKS: list[BenchmarkCase] = [
    BenchmarkCase(
        id="consist-direct-001",
        input="翻译'hello world'成中文",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="consistency",
        subcategory="deterministic",
        tags=["consistency", "translation"],
    ),
    BenchmarkCase(
        id="consist-direct-002",
        input="什么是RAG?",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="consistency",
        subcategory="deterministic",
        tags=["consistency", "qa"],
    ),
    BenchmarkCase(
        id="consist-react-001",
        input="搜索AI Agent市场数据",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="consistency",
        subcategory="deterministic",
        tags=["consistency", "search"],
    ),
    BenchmarkCase(
        id="consist-geo-001",
        input="帮我优化这篇文章的SEO",
        expected_skill="geo_optimizer",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="consistency",
        subcategory="deterministic",
        tags=["consistency", "seo"],
    ),
    BenchmarkCase(
        id="consist-deai-001",
        input="帮我把这篇文章去AI化",
        expected_skill="deai_agent",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="consistency",
        subcategory="deterministic",
        tags=["consistency", "deai"],
    ),
]


# ═══════════════════════════════════════════════════════════════════════════
# Semantic Router Benchmarks
# ═══════════════════════════════════════════════════════════════════════════

# Semantic-router ground truth; all use category="semantic_router" and
# subcategory="description_match".
SEMANTIC_ROUTER_BENCHMARKS: list[BenchmarkCase] = [
    BenchmarkCase(
        id="semantic-direct-001",
        input="简单生成任务,无需工具调用",
        expected_skill="direct_agent",
        expected_execution_mode="direct",
        expected_complexity="low",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["只需要一次生成的简单任务", "Single LLM call task"],
        tags=["semantic", "direct"],
    ),
    BenchmarkCase(
        id="semantic-react-001",
        input="需要动态适应、逐步推理和工具调用",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["需要多步推理和工具", "Multi-step reasoning with tools"],
        tags=["semantic", "react"],
    ),
    BenchmarkCase(
        id="semantic-rewoo-001",
        input="多源数据并行采集、无依赖工具调用批量执行",
        expected_skill="rewoo_agent",
        expected_execution_mode="rewoo",
        expected_complexity="high",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["并行批量获取数据", "Parallel data collection"],
        tags=["semantic", "rewoo"],
    ),
    BenchmarkCase(
        id="semantic-reflex-001",
        input="需要高精度和自我验证的任务",
        expected_skill="reflexion_agent",
        expected_execution_mode="reflexion",
        expected_complexity="high",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["需要自我检查的高精度任务", "High-precision self-verification task"],
        tags=["semantic", "reflexion"],
    ),
    BenchmarkCase(
        id="semantic-planexec-001",
        input="结构化多步骤任务,需要可审查的规划和执行",
        expected_skill="plan_exec_agent",
        expected_execution_mode="plan_exec",
        expected_complexity="high",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["需要先规划再执行的任务", "Structured planning and execution"],
        tags=["semantic", "plan_exec"],
    ),
    BenchmarkCase(
        id="semantic-geo-001",
        input="对文章进行GEO/SEO优化,提升在AI搜索引擎中的可见性",
        expected_skill="geo_optimizer",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["提升内容搜索排名", "Improve content visibility in AI search"],
        tags=["semantic", "geo"],
    ),
    BenchmarkCase(
        id="semantic-citation-001",
        input="检测品牌在各AI平台回答中的引用情况",
        expected_skill="citation_detector",
        expected_execution_mode="custom",
        expected_complexity="medium",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["分析品牌被AI引用的情况", "Check brand citation across AI platforms"],
        tags=["semantic", "citation"],
    ),
    BenchmarkCase(
        id="semantic-competitor-001",
        input="分析竞品策略、对比品牌差距或发现竞争机会",
        expected_skill="competitor_analyzer",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="semantic_router",
        subcategory="description_match",
        paraphrases=["竞品对比和差距分析", "Competitive gap analysis"],
        tags=["semantic", "competitor"],
    ),
]


# ═══════════════════════════════════════════════════════════════════════════
# Alignment Guard Benchmarks
# ═══════════════════════════════════════════════════════════════════════════

# Alignment ground truth; all use category="alignment", one case per
# subcategory. No case here defines paraphrases.
ALIGNMENT_BENCHMARKS: list[BenchmarkCase] = [
    # Input carries a "do not mention" style constraint.
    BenchmarkCase(
        id="align-negative-001",
        input="写一篇产品介绍,不要提及价格",
        expected_skill="content_generator",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="alignment",
        subcategory="negative_constraint",
        tags=["alignment", "negative_constraint"],
    ),
    # Input carries a "must include" style constraint.
    BenchmarkCase(
        id="align-positive-001",
        input="生成报告,必须包含摘要部分",
        expected_skill="plan_exec_agent",
        expected_execution_mode="plan_exec",
        expected_complexity="high",
        category="alignment",
        subcategory="positive_constraint",
        tags=["alignment", "positive_constraint"],
    ),
    BenchmarkCase(
        id="align-cascade-001",
        input="反复搜索相同关键词",
        expected_skill="react_agent",
        expected_execution_mode="react",
        expected_complexity="high",
        category="alignment",
        subcategory="cascade_detection",
        tags=["alignment", "cascade"],
    ),
    # Baseline input without any constraint.
    BenchmarkCase(
        id="align-no-constraint-001",
        input="帮我写一篇文章",
        expected_skill="content_generator",
        expected_execution_mode="llm_generate",
        expected_complexity="low",
        category="alignment",
        subcategory="no_constraint",
        tags=["alignment", "baseline"],
    ),
    # Input combines a "must include" and a "do not mention" constraint.
    BenchmarkCase(
        id="align-combined-001",
        input="生成竞品分析报告,必须包含对比表格,不要提及内部数据",
        expected_skill="competitor_analyzer",
        expected_execution_mode="tool_call",
        expected_complexity="medium",
        category="alignment",
        subcategory="combined_constraint",
        tags=["alignment", "combined"],
    ),
]


# ═══════════════════════════════════════════════════════════════════════════
# All benchmarks combined
# ═══════════════════════════════════════════════════════════════════════════

# Flat list of every case; the unpacking order below is the iteration order
# of ALL_BENCHMARKS.
ALL_BENCHMARKS: list[BenchmarkCase] = [
    *ROUTING_KEYWORD_BENCHMARKS,
    *ROUTING_EDGE_BENCHMARKS,
    *EXECUTION_BENCHMARKS,
    *TEAM_BENCHMARKS,
    *CONSISTENCY_BENCHMARKS,
    *SEMANTIC_ROUTER_BENCHMARKS,
    *ALIGNMENT_BENCHMARKS,
]


def get_benchmarks_by_category(category: str) -> list[BenchmarkCase]:
    """Return the cases in ``ALL_BENCHMARKS`` whose category equals *category*.

    Matching is exact string equality and results keep their
    ``ALL_BENCHMARKS`` order; a category with no cases yields an empty list.
    """
    return [case for case in ALL_BENCHMARKS if case.category == category]


def get_benchmarks_by_subcategory(subcategory: str) -> list[BenchmarkCase]:
    """Return the cases in ``ALL_BENCHMARKS`` whose subcategory equals *subcategory*.

    Only ``subcategory`` is compared, so cases from different categories that
    share a subcategory name are all returned.
    """
    return [case for case in ALL_BENCHMARKS if case.subcategory == subcategory]


def get_benchmarks_with_paraphrases() -> list[BenchmarkCase]:
    """Return the cases that define at least one paraphrase.

    Intended for overfitting detection; cases whose ``paraphrases`` list is
    empty are left out.
    """
    return [case for case in ALL_BENCHMARKS if case.paraphrases]


def get_skill_names_needed() -> set[str]:
    """Collect the distinct skill names referenced by the benchmarks.

    Intended for pre-registration; cases whose ``expected_skill`` is ``None``
    contribute nothing to the result.
    """
    return {
        case.expected_skill
        for case in ALL_BENCHMARKS
        if case.expected_skill is not None
    }


def get_benchmark_stats() -> dict[str, int]:
    """Count the cases in ``ALL_BENCHMARKS`` per category.

    Returns:
        A plain ``dict`` mapping each category name to its number of cases,
        in order of first appearance, followed by a ``"total"`` entry holding
        ``len(ALL_BENCHMARKS)``.
    """
    # Counter preserves first-seen order; hand callers back a plain dict.
    stats: dict[str, int] = dict(Counter(case.category for case in ALL_BENCHMARKS))
    # NOTE: a category literally named "total" would be overwritten here.
    stats["total"] = len(ALL_BENCHMARKS)
    return stats