# fischer-agentkit/tests/e2e/benchmark_generator.py
#
# NOTE: The repository viewer flagged this file as containing ambiguous Unicode
# characters, i.e. characters that might be confused with other characters.
# Review the string literals below carefully before editing them.

"""Benchmark Generator — Auto-generate benchmark cases from skill configs.
Reads configs/skills/*.yaml, extracts intent.keywords/description/examples,
and generates BenchmarkCase objects aligned with actual skill configurations.
This ensures the benchmark dataset stays in sync with the real skill registry.
"""
import logging
from pathlib import Path

import yaml
from pydantic import BaseModel, ConfigDict

from tests.e2e.benchmark_dataset import BenchmarkCase
# ═══════════════════════════════════════════════════════════════════════════
# Skill Config Model
# ═══════════════════════════════════════════════════════════════════════════
class SkillIntent(BaseModel):
    """Intent section of a skill config.

    Keys that are not declared below are ignored during validation
    (``extra="ignore"``), so a skill YAML may carry additional intent
    settings without breaking benchmark generation.
    """

    model_config = ConfigDict(extra="ignore")

    # Keywords associated with the skill; the generator builds one routing
    # case per keyword.
    keywords: list[str] = []
    # Free-text description; the generator uses it as a semantic-router input.
    description: str = ""
    # Example user queries; the generator builds one routing case per example.
    examples: list[str] = []
class SkillConfig(BaseModel):
    """Minimal skill config model for benchmark generation.

    Only the fields the benchmark generator reads are declared; every other
    key found in a skill YAML file is ignored (``extra="ignore"``).
    """

    model_config = ConfigDict(extra="ignore")

    # Skill identifier; the only required field.
    name: str
    description: str = ""
    # A value other than "direct" takes precedence over ``task_mode`` when the
    # generator resolves the effective execution mode.
    execution_mode: str = "direct"
    task_mode: str = "llm_generate"
    # Intent metadata; defaults to an empty intent when the YAML omits it.
    intent: SkillIntent = SkillIntent()
# ═══════════════════════════════════════════════════════════════════════════
# Complexity Mapping
# ═══════════════════════════════════════════════════════════════════════════
# Maps an effective execution mode to the complexity label expected for it.
# Lookup sites use ``.get(mode, "low")``, so an unlisted mode counts as "low".
EXECUTION_MODE_TO_COMPLEXITY: dict[str, str] = {
    # Low complexity.
    "direct": "low",
    # High complexity.
    "react": "high",
    "rewoo": "high",
    "reflexion": "high",
    "plan_exec": "high",
    # Remaining modes.
    "tool_call": "medium",
    "llm_generate": "low",
    "custom": "medium",
}
# Paraphrase templates for auto-generating paraphrases from examples.
# Each template has a single ``{action}`` placeholder.
# NOTE(review): nothing in the visible part of this module references these
# lists -- confirm whether they are used elsewhere before changing them.

# Chinese request templates.
PARAPHRASE_TEMPLATES_CN: list[str] = [
    "请帮我{action}",
    "我需要{action}",
    "能不能{action}",
]

# English request templates.
PARAPHRASE_TEMPLATES_EN: list[str] = [
    "Please help me {action}",
    "I need to {action}",
    "Can you {action}",
]
# ═══════════════════════════════════════════════════════════════════════════
# Benchmark Generator
# ═══════════════════════════════════════════════════════════════════════════
class BenchmarkGenerator:
    """Generate benchmark cases from skill config YAML files.

    Skill configs are read once from ``configs_dir`` (``*.yaml`` files) and
    cached on the instance; the ``generate_*`` methods derive
    ``BenchmarkCase`` objects from the cached configs.
    """

    def __init__(self, configs_dir: str | Path | None = None) -> None:
        """Create a generator.

        Args:
            configs_dir: Directory containing the skill YAML files, as a
                string or ``Path``. When ``None``, a default
                ``configs/skills`` directory is located relative to this
                file (see ``_default_configs_dir``).
        """
        if configs_dir is None:
            configs_dir = self._default_configs_dir()
        # Stored as ``str`` so the attribute type is the same for every caller.
        self.configs_dir = str(configs_dir)
        self._skills: list[SkillConfig] = []
        self._loaded = False

    @staticmethod
    def _default_configs_dir() -> Path:
        """Locate the default ``configs/skills`` directory.

        The historical default (four levels above this file) is preferred
        whenever it exists. Because this file lives in ``tests/e2e/``, four
        levels up is one directory *above* the one containing ``tests/``, so
        the directory three levels up is tried as well. If neither exists the
        historical path is returned unchanged.
        """
        here = Path(__file__)
        legacy_root = here.parent.parent.parent.parent
        package_root = here.parent.parent.parent
        for root in (legacy_root, package_root):
            candidate = root / "configs" / "skills"
            if candidate.is_dir():
                return candidate
        return legacy_root / "configs" / "skills"

    @staticmethod
    def _contains_cjk(text: str) -> bool:
        """Return True if *text* has a character in U+4E00..U+9FFF."""
        return any("\u4e00" <= ch <= "\u9fff" for ch in text)

    def load_skills(self) -> list[SkillConfig]:
        """Load all skill configs from YAML files.

        Files are processed in sorted name order. A file that cannot be read
        or parsed, or whose content fails ``SkillConfig`` validation, is
        skipped with a logged warning. Files whose top-level content is empty
        or not a mapping are skipped silently. The result is cached; if the
        directory does not exist nothing is cached, so a later call retries.

        Returns:
            The list of successfully loaded skill configs.
        """
        if self._loaded:
            return self._skills

        skills_dir = Path(self.configs_dir)
        if not skills_dir.exists():
            return self._skills

        log = logging.getLogger(__name__)
        # Collect into a local list so a failure part-way through can never
        # leave a half-filled cache that a retry would then duplicate.
        loaded: list[SkillConfig] = []
        for yaml_file in sorted(skills_dir.glob("*.yaml")):
            try:
                with open(yaml_file, encoding="utf-8") as f:
                    data = yaml.safe_load(f)
            except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
                log.warning("Skipping unreadable skill config %s: %s", yaml_file, exc)
                continue

            if not data or not isinstance(data, dict):
                continue

            try:
                loaded.append(SkillConfig(**data))
            except (TypeError, ValueError) as exc:
                # pydantic's ValidationError derives from ValueError; a
                # TypeError arises when the mapping has non-string keys.
                log.warning("Skipping invalid skill config %s: %s", yaml_file, exc)

        self._skills = loaded
        self._loaded = True
        return self._skills

    def _get_effective_execution_mode(self, skill: SkillConfig) -> str:
        """Get the effective execution mode for a skill.

        An ``execution_mode`` other than ``"direct"`` wins; otherwise the
        skill's ``task_mode`` is used, falling back to ``"direct"`` when the
        task mode is empty.
        """
        if skill.execution_mode and skill.execution_mode != "direct":
            return skill.execution_mode
        return skill.task_mode if skill.task_mode else "direct"

    def _generate_paraphrases(self, example: str, keywords: list[str]) -> list[str]:
        """Generate up to three paraphrases for an example query.

        Args:
            example: The example query to vary.
            keywords: Skill keywords; for Chinese examples the first keyword
                is used for an extra variant when it is absent from *example*.

        Returns:
            At most three paraphrases; an empty list for an empty example.
        """
        if not example:
            # Nothing to paraphrase, and the English branch indexes example[0].
            return []

        paraphrases: list[str] = []
        if self._contains_cjk(example):
            # NOTE(review): the original prefix literals were empty strings,
            # which made both checks below always false. The prefixes used
            # here are a reconstruction -- confirm against the intended data.
            if not example.startswith(("请", "帮我", "能不能")):
                paraphrases.append(f"请帮我{example}")
            if not example.startswith("我需要"):
                paraphrases.append(f"我需要{example}")
            # Keyword-based variant.
            if keywords:
                kw = keywords[0]
                if kw not in example:
                    paraphrases.append(f"关于{kw}{example}")
        else:
            lower = example.lower()
            # Lower-case the first character so the example reads naturally
            # after the prefix.
            decapitalized = f"{example[0].lower()}{example[1:]}"
            if not lower.startswith(("please", "can you")):
                paraphrases.append(f"Please {decapitalized}")
            if not lower.startswith("i need"):
                paraphrases.append(f"I need to {decapitalized}")

        return paraphrases[:3]  # Max 3 paraphrases per example

    def generate_routing_benchmarks(self) -> list[BenchmarkCase]:
        """Generate routing benchmark cases from all skills.

        Each skill yields one case per intent example (with generated
        paraphrases) and one case per intent keyword. Both kinds share one
        running counter for their ids.
        """
        skills = self.load_skills()
        cases: list[BenchmarkCase] = []
        case_counter = 0

        for skill in skills:
            exec_mode = self._get_effective_execution_mode(skill)
            complexity = EXECUTION_MODE_TO_COMPLEXITY.get(exec_mode, "low")

            # Cases from intent.examples.
            for example in skill.intent.examples:
                case_counter += 1
                paraphrases = self._generate_paraphrases(example, skill.intent.keywords)
                cases.append(
                    BenchmarkCase(
                        id=f"route-auto-{case_counter:03d}",
                        input=example,
                        expected_skill=skill.name,
                        expected_execution_mode=exec_mode,
                        expected_complexity=complexity,
                        category="routing",
                        subcategory="keyword_match",
                        paraphrases=paraphrases,
                        tags=skill.intent.keywords[:3],
                    )
                )

            # Cases from intent.keywords (one case per keyword).
            for keyword in skill.intent.keywords:
                case_counter += 1
                query = (
                    f"帮我{keyword}"
                    if self._contains_cjk(keyword)
                    else f"Help me {keyword}"
                )
                cases.append(
                    BenchmarkCase(
                        id=f"route-kw-auto-{case_counter:03d}",
                        input=query,
                        expected_skill=skill.name,
                        expected_execution_mode=exec_mode,
                        expected_complexity=complexity,
                        category="routing",
                        subcategory="keyword_match",
                        tags=[keyword],
                    )
                )

        return cases

    def generate_execution_benchmarks(self) -> list[BenchmarkCase]:
        """Generate execution mode benchmark cases.

        Skills are grouped by effective execution mode. For each mode, up to
        two skills that have at least one example contribute a case built
        from their first example; the second example, if any, becomes the
        paraphrase.
        """
        skills = self.load_skills()
        cases: list[BenchmarkCase] = []
        case_counter = 0

        # Group skills by execution mode.
        mode_groups: dict[str, list[SkillConfig]] = {}
        for skill in skills:
            mode = self._get_effective_execution_mode(skill)
            mode_groups.setdefault(mode, []).append(skill)

        for mode, group in mode_groups.items():
            complexity = EXECUTION_MODE_TO_COMPLEXITY.get(mode, "low")
            # Filter before slicing so skills without examples do not use up
            # the two slots available per mode.
            with_examples = [s for s in group if s.intent.examples]
            for skill in with_examples[:2]:  # Max 2 skills per mode
                case_counter += 1
                cases.append(
                    BenchmarkCase(
                        id=f"exec-auto-{case_counter:03d}",
                        input=skill.intent.examples[0],
                        expected_skill=skill.name,
                        expected_execution_mode=mode,
                        expected_complexity=complexity,
                        category="execution",
                        subcategory=f"{mode}_mode",
                        paraphrases=skill.intent.examples[1:2],
                        tags=[mode],
                    )
                )

        return cases

    def generate_team_benchmarks(self) -> list[BenchmarkCase]:
        """Generate team collaboration benchmark cases.

        Uses skills whose effective execution mode maps to ``"high"``
        complexity. With two or more such skills an explicit ``@team:`` case
        is produced for the first two; with at least one, a
        complexity-triggered case is produced from the first.
        """
        skills = self.load_skills()
        cases: list[BenchmarkCase] = []
        case_counter = 0

        # High-complexity skills suitable for team collaboration.
        high_complexity_skills = [
            s
            for s in skills
            if EXECUTION_MODE_TO_COMPLEXITY.get(self._get_effective_execution_mode(s), "low")
            == "high"
        ]

        if len(high_complexity_skills) >= 2:
            skill_a, skill_b = high_complexity_skills[0], high_complexity_skills[1]
            case_counter += 1
            cases.append(
                BenchmarkCase(
                    id=f"team-auto-{case_counter:03d}",
                    input=f"@team:{skill_a.name},{skill_b.name} 协作完成复杂分析任务",
                    expected_execution_mode="react",
                    expected_complexity="high",
                    category="team",
                    subcategory="explicit_team",
                    paraphrases=[
                        # NOTE(review): the original ran the two names
                        # together with no separator; "和" is a
                        # reconstruction -- confirm the intended wording.
                        f"需要{skill_a.name}和{skill_b.name}协作分析",
                        f"组建团队:{skill_a.name} + {skill_b.name}",
                    ],
                    tags=["team", skill_a.name, skill_b.name],
                )
            )

        # Complexity-triggered team.
        if high_complexity_skills:
            skill = high_complexity_skills[0]
            # The first keyword names the task; fall back to a generic verb.
            focus = skill.intent.keywords[0] if skill.intent.keywords else "分析"
            case_counter += 1
            cases.append(
                BenchmarkCase(
                    id=f"team-complexity-{case_counter:03d}",
                    input=f"深度{focus}并生成详细报告",
                    expected_execution_mode="react",
                    expected_complexity="high",
                    category="team",
                    subcategory="complexity_trigger",
                    paraphrases=[
                        f"全面{focus}并输出报告",
                    ],
                    tags=["team", "complexity"],
                )
            )

        return cases

    def generate_semantic_benchmarks(self) -> list[BenchmarkCase]:
        """Generate semantic router specific benchmark cases.

        One case is produced per skill that has a non-empty intent
        description; the description itself is the input, so the case
        exercises semantic matching rather than keyword matching.
        """
        skills = self.load_skills()
        cases: list[BenchmarkCase] = []
        case_counter = 0

        for skill in skills:
            if not skill.intent.description:
                continue
            case_counter += 1
            exec_mode = self._get_effective_execution_mode(skill)
            cases.append(
                BenchmarkCase(
                    id=f"semantic-auto-{case_counter:03d}",
                    input=skill.intent.description,
                    expected_skill=skill.name,
                    expected_execution_mode=exec_mode,
                    expected_complexity=EXECUTION_MODE_TO_COMPLEXITY.get(exec_mode, "low"),
                    category="semantic_router",
                    subcategory="description_match",
                    tags=["semantic", skill.name],
                )
            )

        return cases

    def generate_all(self) -> list[BenchmarkCase]:
        """Generate all auto-generated benchmark cases.

        Concatenates routing, execution, team and semantic cases, in that
        order.
        """
        cases: list[BenchmarkCase] = []
        cases.extend(self.generate_routing_benchmarks())
        cases.extend(self.generate_execution_benchmarks())
        cases.extend(self.generate_team_benchmarks())
        cases.extend(self.generate_semantic_benchmarks())
        return cases

    def get_skill_names(self) -> set[str]:
        """Get all skill names from configs."""
        return {s.name for s in self.load_skills()}
# ═══════════════════════════════════════════════════════════════════════════
# Singleton for reuse
# ═══════════════════════════════════════════════════════════════════════════
# Module-level cache holding the shared generator; created on first use.
_generator: BenchmarkGenerator | None = None


def get_generator() -> BenchmarkGenerator:
    """Get or create the singleton BenchmarkGenerator.

    The instance is built with the default configs directory on the first
    call and reused afterwards, so skill configs loaded through it are cached
    for every caller in the process.
    """
    global _generator
    if _generator is None:
        _generator = BenchmarkGenerator()
    return _generator