fischer-agentkit/tests/e2e/benchmark_generator.py

"""Benchmark Generator — Auto-generate benchmark cases from skill configs.

Reads configs/skills/*.yaml, extracts intent.keywords/description/examples,
and generates BenchmarkCase objects aligned with actual skill configurations.

This ensures the benchmark dataset stays in sync with the real skill registry.
"""

import logging
from pathlib import Path

import yaml
from pydantic import BaseModel, ConfigDict

from tests.e2e.benchmark_dataset import BenchmarkCase


# ═══════════════════════════════════════════════════════════════════════════
# Skill Config Model
# ═══════════════════════════════════════════════════════════════════════════


class SkillIntent(BaseModel):
    """Intent section of a skill config.

    Carries the routing hints that BenchmarkGenerator turns into benchmark
    inputs. Every field has a default, so an intent section may omit any of
    them.
    """

    # extra="ignore": keys present in the YAML but not declared below are
    # dropped by pydantic instead of failing validation.
    model_config = ConfigDict(extra="ignore")

    # Trigger keywords. Each one becomes its own routing case
    # ("帮我<kw>" / "Help me <kw>"); the first three also tag example cases.
    keywords: list[str] = []
    # Free-text description; used verbatim as the input of a semantic-router
    # case. Skills with an empty description get no semantic case.
    description: str = ""
    # Sample user queries; each becomes a routing case, and the first one
    # may be reused as an execution-mode case.
    examples: list[str] = []


class SkillConfig(BaseModel):
    """Minimal skill config model for benchmark generation.

    Only the fields the generator reads are declared; ``name`` is the single
    required one.
    """

    # extra="ignore": all other keys of the skill YAML are dropped by pydantic
    # rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # Skill identifier; emitted as BenchmarkCase.expected_skill.
    name: str
    # Top-level description. Not read by the generator code in this module
    # (the semantic cases use intent.description instead).
    description: str = ""
    # execution_mode wins when it is set to anything other than "direct";
    # otherwise task_mode is reported as the effective mode
    # (see BenchmarkGenerator._get_effective_execution_mode).
    execution_mode: str = "direct"
    task_mode: str = "llm_generate"
    # Routing hints; defaults to an empty intent when the YAML has none.
    intent: SkillIntent = SkillIntent()


# ═══════════════════════════════════════════════════════════════════════════
# Complexity Mapping
# ═══════════════════════════════════════════════════════════════════════════

# Maps the mode returned by BenchmarkGenerator._get_effective_execution_mode
# to the label stored in BenchmarkCase.expected_complexity. That method can
# return either an execution_mode or a task_mode value, which is why both
# kinds of names appear as keys here. Every lookup in this module uses
# .get(mode, "low"), so a mode missing from this table is treated as "low".
EXECUTION_MODE_TO_COMPLEXITY: dict[str, str] = {
    "direct": "low",
    "react": "high",
    "rewoo": "high",
    "reflexion": "high",
    "plan_exec": "high",
    "tool_call": "medium",
    "llm_generate": "low",
    "custom": "medium",
}

# Paraphrase templates for auto-generating paraphrases from examples.
# Each template has a single "{action}" placeholder meant to be filled via
# str.format(action=...).
# NOTE(review): nothing in the visible part of this module references these
# two lists — BenchmarkGenerator._generate_paraphrases builds its prefixes
# inline. Confirm whether they are used elsewhere before relying on them.
PARAPHRASE_TEMPLATES_CN: list[str] = [
    "请帮我{action}",
    "我需要{action}",
    "能不能{action}",
]

PARAPHRASE_TEMPLATES_EN: list[str] = [
    "Please help me {action}",
    "I need to {action}",
    "Can you {action}",
]


# ═══════════════════════════════════════════════════════════════════════════
# Benchmark Generator
# ═══════════════════════════════════════════════════════════════════════════


class BenchmarkGenerator:
    """Generate benchmark cases from skill config YAML files."""

    def __init__(self, configs_dir: str | None = None) -> None:
        if configs_dir is None:
            # Default: project_root/configs/skills/
            project_root = Path(__file__).parent.parent.parent.parent
            configs_dir = str(project_root / "configs" / "skills")
        self.configs_dir = configs_dir
        self._skills: list[SkillConfig] = []
        self._loaded = False

    def load_skills(self) -> list[SkillConfig]:
        """Load all skill configs from YAML files."""
        if self._loaded:
            return self._skills

        skills_dir = Path(self.configs_dir)
        if not skills_dir.exists():
            return self._skills

        for yaml_file in sorted(skills_dir.glob("*.yaml")):
            with open(yaml_file, encoding="utf-8") as f:
                data = yaml.safe_load(f)
            if data and isinstance(data, dict):
                try:
                    skill = SkillConfig(**data)
                    self._skills.append(skill)
                except Exception:
                    continue

        self._loaded = True
        return self._skills

    def _get_effective_execution_mode(self, skill: SkillConfig) -> str:
        """Get the effective execution mode for a skill."""
        if skill.execution_mode and skill.execution_mode != "direct":
            return skill.execution_mode
        # Map task_mode to execution mode
        return skill.task_mode if skill.task_mode else "direct"

    def _generate_paraphrases(self, example: str, keywords: list[str]) -> list[str]:
        """Generate paraphrases for an example query."""
        paraphrases: list[str] = []

        # Simple paraphrase generation: add prefix variations
        is_chinese = any("\u4e00" <= c <= "\u9fff" for c in example)

        if is_chinese:
            # Chinese paraphrases
            if not example.startswith("请") and not example.startswith("帮"):
                paraphrases.append(f"请{example}")
            if not example.startswith("我"):
                paraphrases.append(f"我需要{example}")
            # Add keyword-based variant
            if keywords:
                kw = keywords[0]
                if kw not in example:
                    paraphrases.append(f"关于{kw}，{example}")
        else:
            # English paraphrases
            lower = example.lower()
            if not lower.startswith("please") and not lower.startswith("can you"):
                paraphrases.append(f"Please {example[0].lower()}{example[1:]}")
            if not lower.startswith("i need"):
                paraphrases.append(f"I need to {example[0].lower()}{example[1:]}")

        return paraphrases[:3]  # Max 3 paraphrases per example

    def generate_routing_benchmarks(self) -> list[BenchmarkCase]:
        """Generate routing benchmark cases from all skills."""
        skills = self.load_skills()
        cases: list[BenchmarkCase] = []
        case_counter = 0

        for skill in skills:
            exec_mode = self._get_effective_execution_mode(skill)
            complexity = EXECUTION_MODE_TO_COMPLEXITY.get(exec_mode, "low")

            # Generate from intent.examples
            for example in skill.intent.examples:
                case_counter += 1
                paraphrases = self._generate_paraphrases(example, skill.intent.keywords)
                cases.append(
                    BenchmarkCase(
                        id=f"route-auto-{case_counter:03d}",
                        input=example,
                        expected_skill=skill.name,
                        expected_execution_mode=exec_mode,
                        expected_complexity=complexity,
                        category="routing",
                        subcategory="keyword_match",
                        paraphrases=paraphrases,
                        tags=skill.intent.keywords[:3],
                    )
                )

            # Generate from intent.keywords (one case per keyword)
            for keyword in skill.intent.keywords:
                case_counter += 1
                query = (
                    f"帮我{keyword}"
                    if any("\u4e00" <= c <= "\u9fff" for c in keyword)
                    else f"Help me {keyword}"
                )
                cases.append(
                    BenchmarkCase(
                        id=f"route-kw-auto-{case_counter:03d}",
                        input=query,
                        expected_skill=skill.name,
                        expected_execution_mode=exec_mode,
                        expected_complexity=complexity,
                        category="routing",
                        subcategory="keyword_match",
                        tags=[keyword],
                    )
                )

        return cases

    def generate_execution_benchmarks(self) -> list[BenchmarkCase]:
        """Generate execution mode benchmark cases."""
        skills = self.load_skills()
        cases: list[BenchmarkCase] = []
        case_counter = 0

        # Group skills by execution mode
        mode_groups: dict[str, list[SkillConfig]] = {}
        for skill in skills:
            mode = self._get_effective_execution_mode(skill)
            mode_groups.setdefault(mode, []).append(skill)

        for mode, group in mode_groups.items():
            complexity = EXECUTION_MODE_TO_COMPLEXITY.get(mode, "low")
            for skill in group[:2]:  # Max 2 skills per mode
                if skill.intent.examples:
                    case_counter += 1
                    cases.append(
                        BenchmarkCase(
                            id=f"exec-auto-{case_counter:03d}",
                            input=skill.intent.examples[0],
                            expected_skill=skill.name,
                            expected_execution_mode=mode,
                            expected_complexity=complexity,
                            category="execution",
                            subcategory=f"{mode}_mode",
                            paraphrases=skill.intent.examples[1:2],
                            tags=[mode],
                        )
                    )

        return cases

    def generate_team_benchmarks(self) -> list[BenchmarkCase]:
        """Generate team collaboration benchmark cases."""
        skills = self.load_skills()
        cases: list[BenchmarkCase] = []
        case_counter = 0

        # High-complexity skills suitable for team collaboration
        high_complexity_skills = [
            s
            for s in skills
            if EXECUTION_MODE_TO_COMPLEXITY.get(self._get_effective_execution_mode(s), "low")
            == "high"
        ]

        if len(high_complexity_skills) >= 2:
            skill_a, skill_b = high_complexity_skills[0], high_complexity_skills[1]
            case_counter += 1
            cases.append(
                BenchmarkCase(
                    id=f"team-auto-{case_counter:03d}",
                    input=f"@team:{skill_a.name},{skill_b.name} 协作完成复杂分析任务",
                    expected_execution_mode="react",
                    expected_complexity="high",
                    category="team",
                    subcategory="explicit_team",
                    paraphrases=[
                        f"需要{skill_a.name}和{skill_b.name}协作分析",
                        f"组建团队：{skill_a.name} + {skill_b.name}",
                    ],
                    tags=["team", skill_a.name, skill_b.name],
                )
            )

        # Complexity-triggered team
        if high_complexity_skills:
            skill = high_complexity_skills[0]
            case_counter += 1
            cases.append(
                BenchmarkCase(
                    id=f"team-complexity-{case_counter:03d}",
                    input=f"深度{skill.intent.keywords[0] if skill.intent.keywords else '分析'}并生成详细报告",
                    expected_execution_mode="react",
                    expected_complexity="high",
                    category="team",
                    subcategory="complexity_trigger",
                    paraphrases=[
                        f"全面{skill.intent.keywords[0] if skill.intent.keywords else '分析'}并输出报告",
                    ],
                    tags=["team", "complexity"],
                )
            )

        return cases

    def generate_semantic_benchmarks(self) -> list[BenchmarkCase]:
        """Generate semantic router specific benchmark cases."""
        skills = self.load_skills()
        cases: list[BenchmarkCase] = []
        case_counter = 0

        for skill in skills:
            if not skill.intent.description:
                continue
            case_counter += 1
            # Use description as input (tests semantic matching, not keyword matching)
            cases.append(
                BenchmarkCase(
                    id=f"semantic-auto-{case_counter:03d}",
                    input=skill.intent.description,
                    expected_skill=skill.name,
                    expected_execution_mode=self._get_effective_execution_mode(skill),
                    expected_complexity=EXECUTION_MODE_TO_COMPLEXITY.get(
                        self._get_effective_execution_mode(skill), "low"
                    ),
                    category="semantic_router",
                    subcategory="description_match",
                    tags=["semantic", skill.name],
                )
            )

        return cases

    def generate_all(self) -> list[BenchmarkCase]:
        """Generate all auto-generated benchmark cases."""
        cases: list[BenchmarkCase] = []
        cases.extend(self.generate_routing_benchmarks())
        cases.extend(self.generate_execution_benchmarks())
        cases.extend(self.generate_team_benchmarks())
        cases.extend(self.generate_semantic_benchmarks())
        return cases

    def get_skill_names(self) -> set[str]:
        """Get all skill names from configs."""
        return {s.name for s in self.load_skills()}


# ═══════════════════════════════════════════════════════════════════════════
# Singleton for reuse
# ═══════════════════════════════════════════════════════════════════════════

# Module-wide instance, created on first use by get_generator().
_generator: BenchmarkGenerator | None = None


def get_generator() -> BenchmarkGenerator:
    """Return the shared BenchmarkGenerator, creating it on the first call.

    The instance is built with the default ``configs_dir`` and kept in the
    module-level ``_generator`` variable, so later calls return the same
    object (and therefore reuse its loaded-skill cache).
    """
    global _generator
    if _generator is not None:
        return _generator
    _generator = BenchmarkGenerator()
    return _generator