"""Benchmark Generator — Auto-generate benchmark cases from skill configs.

Reads configs/skills/*.yaml, extracts intent.keywords/description/examples,
and generates BenchmarkCase objects aligned with actual skill configurations.

This ensures the benchmark dataset stays in sync with the real skill registry.
"""
|
||
|
||
import logging
from pathlib import Path

import yaml
from pydantic import BaseModel, ConfigDict

from tests.e2e.benchmark_dataset import BenchmarkCase
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════════
# Skill Config Model
# ═══════════════════════════════════════════════════════════════════════════
|
||
|
||
|
||
class SkillIntent(BaseModel):
    """Intent section of a skill config.

    Only the fields declared here are kept; any other keys found under
    ``intent`` in the YAML are ignored (``extra="ignore"``).
    """

    model_config = ConfigDict(extra="ignore")

    # Keywords for the skill; the generator builds one routing case per keyword.
    keywords: list[str] = []
    # Free-text description; used verbatim as the semantic-router input.
    description: str = ""
    # Example queries; each becomes the input of a routing benchmark case.
    examples: list[str] = []
|
||
|
||
|
||
class SkillConfig(BaseModel):
    """Minimal skill config model for benchmark generation.

    Declares only the fields the generator reads; every other key in the
    skill YAML is ignored (``extra="ignore"``). ``name`` is the only
    required field.
    """

    model_config = ConfigDict(extra="ignore")

    # Skill identifier; emitted as ``expected_skill`` on generated cases.
    name: str
    # Top-level description of the skill (distinct from ``intent.description``).
    description: str = ""
    # Declared execution mode; "direct" defers to ``task_mode`` when the
    # generator resolves the effective mode.
    execution_mode: str = "direct"
    # Fallback mode used when ``execution_mode`` is empty or "direct".
    task_mode: str = "llm_generate"
    # Routing hints (keywords / description / examples).
    intent: SkillIntent = SkillIntent()
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════════
# Complexity Mapping
# ═══════════════════════════════════════════════════════════════════════════
|
||
|
||
# Maps an effective execution mode (or task mode) to the complexity label
# expected on generated benchmark cases. Lookups for modes that are not
# listed here fall back to "low" at the call sites.
EXECUTION_MODE_TO_COMPLEXITY: dict[str, str] = {
    "direct": "low",
    "react": "high",
    "rewoo": "high",
    "reflexion": "high",
    "plan_exec": "high",
    "tool_call": "medium",
    "llm_generate": "low",
    "custom": "medium",
}
|
||
|
||
# Paraphrase templates for auto-generating paraphrases from examples.
# Each template has a single ``{action}`` placeholder for ``str.format``.
PARAPHRASE_TEMPLATES_CN: list[str] = [
    "请帮我{action}",
    "我需要{action}",
    "能不能{action}",
]

PARAPHRASE_TEMPLATES_EN: list[str] = [
    "Please help me {action}",
    "I need to {action}",
    "Can you {action}",
]
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════════
# Benchmark Generator
# ═══════════════════════════════════════════════════════════════════════════
|
||
|
||
|
||
class BenchmarkGenerator:
    """Generate benchmark cases from skill config YAML files.

    Skill configs are read lazily from ``configs_dir`` on the first call to
    :meth:`load_skills` and cached on the instance afterwards.
    """

    def __init__(self, configs_dir: str | None = None) -> None:
        """Create a generator.

        Args:
            configs_dir: Directory containing ``*.yaml`` skill configs. When
                ``None``, defaults to ``configs/skills`` under the directory
                four levels above this file.
        """
        if configs_dir is None:
            # Default: project_root/configs/skills/
            project_root = Path(__file__).parent.parent.parent.parent
            configs_dir = str(project_root / "configs" / "skills")
        self.configs_dir = configs_dir
        self._skills: list[SkillConfig] = []
        self._loaded = False

    @staticmethod
    def _contains_chinese(text: str) -> bool:
        """Return True if *text* has a character in U+4E00..U+9FFF."""
        return any("\u4e00" <= ch <= "\u9fff" for ch in text)

    def load_skills(self) -> list[SkillConfig]:
        """Load all skill configs from YAML files.

        Files are processed in sorted order. Documents that are empty or not
        a mapping are skipped; documents that fail ``SkillConfig``
        construction are skipped with a logged warning. YAML parse errors
        and I/O errors propagate to the caller.

        Returns:
            The cached list of loaded skills. If the config directory does
            not exist, the (empty) cache is returned without marking the
            generator as loaded, so a later call will look again.
        """
        if self._loaded:
            return self._skills

        skills_dir = Path(self.configs_dir)
        if not skills_dir.exists():
            return self._skills

        # Collect into a local list and publish only on success, so an
        # exception part-way through cannot leave a partial cache that a
        # retry would then extend with duplicates.
        loaded: list[SkillConfig] = []
        for yaml_file in sorted(skills_dir.glob("*.yaml")):
            with open(yaml_file, encoding="utf-8") as f:
                data = yaml.safe_load(f)
            if not data or not isinstance(data, dict):
                continue
            try:
                loaded.append(SkillConfig(**data))
            except Exception as exc:
                # Best-effort: one bad config must not block the rest, but
                # report it so the skipped file is discoverable.
                logging.getLogger(__name__).warning(
                    "Skipping invalid skill config %s: %s", yaml_file, exc
                )

        self._skills = loaded
        self._loaded = True
        return self._skills

    def _get_effective_execution_mode(self, skill: SkillConfig) -> str:
        """Get the effective execution mode for a skill.

        A non-empty ``execution_mode`` other than ``"direct"`` wins.
        Otherwise ``task_mode`` is used, and ``"direct"`` is returned when
        ``task_mode`` is empty as well.
        """
        if skill.execution_mode and skill.execution_mode != "direct":
            return skill.execution_mode
        # Map task_mode to execution mode
        return skill.task_mode if skill.task_mode else "direct"

    def _generate_paraphrases(self, example: str, keywords: list[str]) -> list[str]:
        """Generate paraphrases for an example query.

        Args:
            example: The example query to rephrase.
            keywords: Skill keywords; only the first one is used, and only
                for Chinese examples that do not already contain it.

        Returns:
            Up to three prefix-based variants of *example*. A blank example
            yields an empty list.
        """
        # A blank example has nothing to rephrase; returning early also
        # avoids indexing example[0] on an empty string below.
        if not example.strip():
            return []

        paraphrases: list[str] = []

        # Simple paraphrase generation: add prefix variations
        if self._contains_chinese(example):
            # Chinese paraphrases
            if not example.startswith(("请", "帮")):
                paraphrases.append(f"请{example}")
            if not example.startswith("我"):
                paraphrases.append(f"我需要{example}")
            # Add keyword-based variant
            if keywords:
                kw = keywords[0]
                if kw not in example:
                    paraphrases.append(f"关于{kw},{example}")
        else:
            # English paraphrases: lower-case the first character so the
            # example reads naturally after the added prefix.
            lower = example.lower()
            decapitalized = f"{example[0].lower()}{example[1:]}"
            if not lower.startswith(("please", "can you")):
                paraphrases.append(f"Please {decapitalized}")
            if not lower.startswith("i need"):
                paraphrases.append(f"I need to {decapitalized}")

        return paraphrases[:3]  # Max 3 paraphrases per example

    def generate_routing_benchmarks(self) -> list[BenchmarkCase]:
        """Generate routing benchmark cases from all skills.

        For each skill, emits one case per ``intent.examples`` entry (with
        generated paraphrases) followed by one case per ``intent.keywords``
        entry. A single counter numbers both kinds of case.
        """
        cases: list[BenchmarkCase] = []
        case_counter = 0

        for skill in self.load_skills():
            exec_mode = self._get_effective_execution_mode(skill)
            complexity = EXECUTION_MODE_TO_COMPLEXITY.get(exec_mode, "low")
            keywords = skill.intent.keywords

            # Generate from intent.examples
            for example in skill.intent.examples:
                case_counter += 1
                cases.append(
                    BenchmarkCase(
                        id=f"route-auto-{case_counter:03d}",
                        input=example,
                        expected_skill=skill.name,
                        expected_execution_mode=exec_mode,
                        expected_complexity=complexity,
                        category="routing",
                        subcategory="keyword_match",
                        paraphrases=self._generate_paraphrases(example, keywords),
                        tags=keywords[:3],
                    )
                )

            # Generate from intent.keywords (one case per keyword)
            for keyword in keywords:
                case_counter += 1
                query = (
                    f"帮我{keyword}"
                    if self._contains_chinese(keyword)
                    else f"Help me {keyword}"
                )
                cases.append(
                    BenchmarkCase(
                        id=f"route-kw-auto-{case_counter:03d}",
                        input=query,
                        expected_skill=skill.name,
                        expected_execution_mode=exec_mode,
                        expected_complexity=complexity,
                        category="routing",
                        subcategory="keyword_match",
                        tags=[keyword],
                    )
                )

        return cases

    def generate_execution_benchmarks(self) -> list[BenchmarkCase]:
        """Generate execution mode benchmark cases.

        Skills are grouped by effective execution mode. For each mode, the
        first two skills in the group are considered and a case is emitted
        for those that have at least one example; the second example, if
        any, is used as the paraphrase.
        """
        cases: list[BenchmarkCase] = []
        case_counter = 0

        # Group skills by execution mode
        mode_groups: dict[str, list[SkillConfig]] = {}
        for skill in self.load_skills():
            mode = self._get_effective_execution_mode(skill)
            mode_groups.setdefault(mode, []).append(skill)

        for mode, group in mode_groups.items():
            complexity = EXECUTION_MODE_TO_COMPLEXITY.get(mode, "low")
            for skill in group[:2]:  # Max 2 skills per mode
                examples = skill.intent.examples
                if not examples:
                    continue
                case_counter += 1
                cases.append(
                    BenchmarkCase(
                        id=f"exec-auto-{case_counter:03d}",
                        input=examples[0],
                        expected_skill=skill.name,
                        expected_execution_mode=mode,
                        expected_complexity=complexity,
                        category="execution",
                        subcategory=f"{mode}_mode",
                        paraphrases=examples[1:2],
                        tags=[mode],
                    )
                )

        return cases

    def generate_team_benchmarks(self) -> list[BenchmarkCase]:
        """Generate team collaboration benchmark cases.

        Emits at most two cases: an explicit ``@team:`` request when at
        least two high-complexity skills exist, and a complexity-triggered
        request when at least one exists.
        """
        cases: list[BenchmarkCase] = []
        case_counter = 0

        # High-complexity skills suitable for team collaboration
        high_complexity_skills = [
            s
            for s in self.load_skills()
            if EXECUTION_MODE_TO_COMPLEXITY.get(
                self._get_effective_execution_mode(s), "low"
            )
            == "high"
        ]

        if len(high_complexity_skills) >= 2:
            skill_a, skill_b = high_complexity_skills[0], high_complexity_skills[1]
            case_counter += 1
            cases.append(
                BenchmarkCase(
                    id=f"team-auto-{case_counter:03d}",
                    input=f"@team:{skill_a.name},{skill_b.name} 协作完成复杂分析任务",
                    expected_execution_mode="react",
                    expected_complexity="high",
                    category="team",
                    subcategory="explicit_team",
                    paraphrases=[
                        f"需要{skill_a.name}和{skill_b.name}协作分析",
                        f"组建团队:{skill_a.name} + {skill_b.name}",
                    ],
                    tags=["team", skill_a.name, skill_b.name],
                )
            )

        # Complexity-triggered team
        if high_complexity_skills:
            skill = high_complexity_skills[0]
            # First keyword of the skill, or a generic fallback verb.
            topic = skill.intent.keywords[0] if skill.intent.keywords else "分析"
            case_counter += 1
            cases.append(
                BenchmarkCase(
                    id=f"team-complexity-{case_counter:03d}",
                    input=f"深度{topic}并生成详细报告",
                    expected_execution_mode="react",
                    expected_complexity="high",
                    category="team",
                    subcategory="complexity_trigger",
                    paraphrases=[
                        f"全面{topic}并输出报告",
                    ],
                    tags=["team", "complexity"],
                )
            )

        return cases

    def generate_semantic_benchmarks(self) -> list[BenchmarkCase]:
        """Generate semantic router specific benchmark cases.

        One case per skill that has a non-empty ``intent.description``.
        """
        cases: list[BenchmarkCase] = []
        case_counter = 0

        for skill in self.load_skills():
            if not skill.intent.description:
                continue
            case_counter += 1
            exec_mode = self._get_effective_execution_mode(skill)
            # Use description as input (tests semantic matching, not keyword matching)
            cases.append(
                BenchmarkCase(
                    id=f"semantic-auto-{case_counter:03d}",
                    input=skill.intent.description,
                    expected_skill=skill.name,
                    expected_execution_mode=exec_mode,
                    expected_complexity=EXECUTION_MODE_TO_COMPLEXITY.get(exec_mode, "low"),
                    category="semantic_router",
                    subcategory="description_match",
                    tags=["semantic", skill.name],
                )
            )

        return cases

    def generate_all(self) -> list[BenchmarkCase]:
        """Generate all auto-generated benchmark cases.

        Concatenates routing, execution, team and semantic cases, in that
        order.
        """
        cases: list[BenchmarkCase] = []
        cases.extend(self.generate_routing_benchmarks())
        cases.extend(self.generate_execution_benchmarks())
        cases.extend(self.generate_team_benchmarks())
        cases.extend(self.generate_semantic_benchmarks())
        return cases

    def get_skill_names(self) -> set[str]:
        """Get all skill names from configs."""
        return {s.name for s in self.load_skills()}
|
||
|
||
|
||
# ═══════════════════════════════════════════════════════════════════════════
# Singleton for reuse
# ═══════════════════════════════════════════════════════════════════════════
|
||
|
||
# Module-wide instance, created on the first get_generator() call.
_generator: BenchmarkGenerator | None = None


def get_generator() -> BenchmarkGenerator:
    """Get or create the singleton BenchmarkGenerator.

    The instance is built with the default config directory on first use and
    the same object is returned on every later call.
    """
    global _generator
    if _generator is None:
        _generator = BenchmarkGenerator()
    return _generator
|