# fischer-agentkit/src/agentkit/experts/orchestrator.py

"""TeamOrchestrator - 流水线模式专家团队执行引擎

驱动 ExpertTeam 在流水线模式下执行任务：

1. Lead Expert 接收任务，分解为阶段（PlanPhase），阶段间有依赖关系（depends_on）
2. 按依赖拓扑排序，同层无依赖阶段并行（asyncio.gather），层间串行
3. 每个阶段创建独立 ConfigDrivenAgent 实例（上下文隔离，KTD3）
4. 阶段间数据通过 SharedWorkspace 传递（{task_id}/phase/{phase_id}/output）
5. Lead Expert 汇总所有阶段结果（BEST 策略）
6. 返回最终结果

生命周期：FORMING → PLANNING → EXECUTING → SYNTHESIZING → COMPLETED

设计依据：
- KTD2: Lead 分解为阶段而非子任务，支持流水线串行阶段
- KTD3: 上下文隔离，独立 ConfigDrivenAgent 实例
- KTD6: PLANNING 状态在分解阶段设置
"""

from __future__ import annotations

import asyncio
import copy
import json
import logging
import re
from datetime import datetime, timezone
from typing import Any

from agentkit.core.config_driven import ConfigDrivenAgent
from agentkit.core.protocol import TaskMessage, TaskResult, TaskStatus
from agentkit.llm.gateway import LLMGateway

from .expert import Expert
from .plan import PhaseStatus, PlanPhase, PlanStatus, TeamPlan
from .team import ExpertTeam, TeamStatus

logger = logging.getLogger(__name__)


class TeamOrchestrator:
    """Pipeline orchestration engine.

    Lead Expert decomposes the task into phases with dependencies (depends_on).
    Phases are executed in topological order: same-layer phases run in parallel
    (asyncio.gather), layers run sequentially. Each phase gets an independent
    ConfigDrivenAgent instance for context isolation (KTD3).
    """

    MAX_PHASES = 10  # Maximum phases Lead Expert can decompose
    MAX_RETRIES = 1  # Retry once on phase failure before marking failed

    def __init__(self, team: ExpertTeam) -> None:
        self._team = team
        # Track temporary agent names created for context isolation (KTD3)
        # Maps phase_id -> temp_agent_name for cleanup
        self._temp_agents: dict[str, str] = {}

    async def execute(self, task: str) -> dict[str, Any]:
        """Execute a task in pipeline mode.

        Flow:
        1. Emit team_formed event
        2. Set PLANNING status, Lead Expert decomposes task into phases
        3. Emit plan_update with phase list
        4. Set EXECUTING status, topological sort, execute layers:
           - Same-layer phases parallel (asyncio.gather)
           - Layer-by-layer sequential
        5. Set SYNTHESIZING status, Lead synthesizes results (BEST strategy)
        6. Set COMPLETED status, emit team_synthesis event

        Returns a dict with:
        - "status": "completed" | "failed" | "fallback"
        - "result": final synthesized result
        - "phase_results": dict of phase_id -> result
        - "plan": TeamPlan instance
        """
        lead = self._team.lead_expert
        if not lead or not lead.is_active:
            active = self._team.active_experts
            if not active:
                return {
                    "status": "failed",
                    "result": None,
                    "phase_results": {},
                    "error": "No active expert available",
                }
            lead = active[0]
            logger.warning(f"Lead expert not available, falling back to '{lead.config.name}'")

        plan = TeamPlan(
            task=task,
            lead_expert=lead.config.name,
            status=PlanStatus.EXECUTING,
        )

        # 1. Emit team_formed event
        # Send experts as IExpertInfo-compatible dicts + plan_phases: [] to match frontend contract
        await self._broadcast_event(
            "team_formed",
            {
                "team_id": self._team.team_id,
                "status": self._team.status.value,
                "lead_expert": lead.config.name,
                "experts": [
                    {
                        "id": e.config.name,
                        "name": e.config.name,
                        "persona": e.config.persona,
                        "avatar": e.config.avatar,
                        "color": e.config.color,
                        "is_lead": e.config.name == lead.config.name,
                        "bound_skills": list(e.config.bound_skills),
                        "status": "active",
                    }
                    for e in self._team.active_experts
                ],
                "plan_phases": [],
            },
        )

        # 2. Set PLANNING status, Lead decomposes task into phases
        self._team.set_status(TeamStatus.PLANNING)
        phases = await self._decompose_task(lead, task)
        if not phases:
            logger.warning("Task decomposition returned no phases, executing as single phase")
            phases = [PlanPhase(name="执行", assigned_expert=lead.config.name, task_description=task)]

        plan.phases = phases[: self.MAX_PHASES]

        # 3. Emit plan_update with phase list
        await self._broadcast_event(
            "plan_update",
            {
                "plan_id": plan.id,
                "plan_phases": [ph.to_dict() for ph in plan.phases],
            },
        )

        # 4. Set EXECUTING status, execute phases
        self._team.set_status(TeamStatus.EXECUTING)
        phase_results: dict[str, dict[str, Any]] = {}

        try:
            # Topological sort phases into execution layers
            layers = plan.topological_sort()

            # Execute layers sequentially, phases within layer in parallel
            for layer in layers:
                # Filter out already-failed phases (from dependency failures)
                ready = [ph for ph in layer if ph.status == PhaseStatus.PENDING]
                if not ready:
                    continue

                # Execute all phases in this layer in parallel
                results = await asyncio.gather(
                    *[self._execute_phase(ph, plan) for ph in ready],
                    return_exceptions=True,
                )

                for ph, result in zip(ready, results):
                    if isinstance(result, (Exception, asyncio.CancelledError)):
                        logger.error(f"Phase {ph.id} ({ph.name}) failed: {result}")
                        plan.update_phase_status(
                            ph.id, PhaseStatus.FAILED, {"error": str(result)}
                        )
                        phase_results[ph.id] = {"error": str(result)}
                        # Emit phase_failed event
                        await self._broadcast_event(
                            "phase_failed",
                            {
                                "phase_id": ph.id,
                                "phase_name": ph.name,
                                "error": str(result),
                            },
                        )
                        # Mark dependent phases as failed
                        await self._mark_dependents_failed(ph.id, plan, phase_results)
                    else:
                        phase_results[ph.id] = result

            # 5. Check if all phases failed
            completed = plan.completed_phases
            if not completed:
                logger.warning("All phases failed, falling back to single agent")
                return await self._fallback_to_single_agent(task, plan, phase_results)

            # 6. Lead Expert synthesizes results (BEST strategy)
            self._team.set_status(TeamStatus.SYNTHESIZING)
            plan.status = PlanStatus.COMPLETED

            final_result = await self._synthesize_results(lead, task, completed)

            self._team.set_status(TeamStatus.COMPLETED)

            # 7. Emit team_synthesis event
            await self._broadcast_event(
                "team_synthesis",
                {
                    "content": final_result.get("content", ""),
                    "phases_completed": len(completed),
                    "phases_total": len(plan.phases),
                },
            )

            # 8. Emit team_dissolved event
            await self._broadcast_event(
                "team_dissolved",
                {"team_id": self._team.team_id},
            )

            return {
                "status": "completed",
                "result": final_result,
                "phase_results": phase_results,
                "plan": plan,
            }

        except ValueError as e:
            # Circular dependency or invalid reference from topological_sort
            logger.error(f"Pipeline execution failed (invalid plan): {e}")
            plan.status = PlanStatus.FAILED
            await self._broadcast_event(
                "team_dissolved", {"team_id": self._team.team_id}
            )
            return await self._fallback_to_single_agent(task, plan, phase_results)
        except Exception as e:
            logger.error(f"Pipeline execution failed: {e}")
            plan.status = PlanStatus.FAILED
            await self._broadcast_event(
                "team_dissolved", {"team_id": self._team.team_id}
            )
            return await self._fallback_to_single_agent(task, plan, phase_results)

    async def _decompose_task(self, lead: Expert, task: str) -> list[PlanPhase]:
        """Lead Expert decomposes task into phases using LLM.

        Returns a list of PlanPhase instances. If LLM decomposition fails,
        returns a single phase with the original task.
        """
        gateway = self._get_llm_gateway(lead)
        if not gateway:
            logger.warning("No LLM gateway available, treating task as single phase")
            return [PlanPhase(name="执行", assigned_expert=lead.config.name, task_description=task)]

        member_names = [
            e.config.name for e in self._team.active_experts if e.config.name != lead.config.name
        ]
        available_experts = member_names if member_names else [lead.config.name]

        prompt = (
            f"You are the Lead Expert in a pipeline team. Decompose the following task into "
            f"at most {self.MAX_PHASES} phases with dependencies.\n\n"
            f"Task: {task}\n\n"
            f"Available experts: {', '.join(available_experts)}\n\n"
            f"Return a JSON array of phase objects, each with:\n"
            f'- "name": phase name (e.g., "规划", "前端", "后端", "QA", "评审")\n'
            f'- "assigned_expert": name of the expert to assign '
            f'(must be one of: {", ".join(available_experts)})\n'
            f'- "task_description": clear phase task description\n'
            f'- "depends_on": array of phase names this phase depends on (empty array if none)\n\n'
            f"Example:\n"
            f'[{{"name":"规划","assigned_expert":"tech_lead",'
            f'"task_description":"设计架构","depends_on":[]}},'
            f'{{"name":"前端","assigned_expert":"frontend",'
            f'"task_description":"实现UI","depends_on":["规划"]}}]\n\n'
            f"Return ONLY the JSON array, no other text."
        )

        try:
            response = await gateway.chat(
                messages=[{"role": "user", "content": prompt}],
                model=self._get_model(lead),
            )
            phases = self._parse_phases(response.content, available_experts, lead.config.name)
            if phases:
                return phases
            logger.warning("LLM decomposition returned no valid phases")
        except Exception as e:
            logger.warning(f"LLM task decomposition failed: {e}")

        return [PlanPhase(name="执行", assigned_expert=lead.config.name, task_description=task)]

    @staticmethod
    def _parse_phases(
        content: str, available_experts: list[str], lead_name: str
    ) -> list[PlanPhase]:
        """Parse LLM response into PlanPhase list.

        Extracts JSON array from the response content and creates PlanPhase instances.
        Resolves depends_on from phase names to phase IDs. Validates assigned_expert
        against available_experts list.
        """
        # Try to extract JSON array from the response
        json_match = re.search(r"\[.*\]", content, re.DOTALL)
        if not json_match:
            return []

        try:
            items = json.loads(json_match.group(0))
        except json.JSONDecodeError:
            return []

        if not isinstance(items, list):
            return []

        # First pass: create phases with IDs, build name->id mapping
        name_to_id: dict[str, str] = {}
        raw_phases: list[dict[str, Any]] = []

        for item in items:
            if not isinstance(item, dict):
                continue
            name = item.get("name", "").strip()
            if not name:
                continue
            assigned = item.get("assigned_expert", "").strip()
            # Validate assigned expert; fall back to lead if invalid
            if assigned not in available_experts:
                assigned = lead_name
            task_desc = item.get("task_description", "").strip() or name
            depends_on_names = item.get("depends_on", [])
            if not isinstance(depends_on_names, list):
                depends_on_names = []

            phase = PlanPhase(
                name=name,
                assigned_expert=assigned,
                task_description=task_desc,
                depends_on=[],  # Will resolve to IDs in second pass
            )
            raw_phases.append({"phase": phase, "depends_on_names": depends_on_names})
            name_to_id[name] = phase.id

        # Second pass: resolve depends_on from names to IDs
        phases: list[PlanPhase] = []
        for entry in raw_phases:
            phase = entry["phase"]
            for dep_name in entry["depends_on_names"]:
                dep_id = name_to_id.get(dep_name)
                if dep_id:
                    phase.depends_on.append(dep_id)
                else:
                    logger.warning(
                        f"Phase '{phase.name}' depends on unknown phase '{dep_name}', ignoring"
                    )
            phases.append(phase)

        return phases

    async def _execute_phase(self, phase: PlanPhase, plan: TeamPlan) -> dict[str, Any]:
        """Execute a single phase using the assigned expert.

        Steps:
        1. Resolve the assigned expert; if it is missing or inactive, fall back
           to the lead expert, then to the first active expert, and record the
           substitution on ``phase.assigned_expert``.
        2. Mark the phase RUNNING and emit ``phase_started`` and ``expert_step``.
        3. Collect the results of completed dependency phases from the plan
           (in memory, not from the workspace) and pass them to the agent.
        4. Run the task on an agent obtained from ``_get_isolated_agent``
           (context isolation, KTD3), making up to ``MAX_RETRIES + 1`` attempts.
        5. On success, mark the phase COMPLETED, write its output to the team
           workspace under ``{plan.id}/phase/{phase.id}/output`` and emit
           ``expert_result`` and ``phase_completed``.

        The temporary agent, if one was created, is cleaned up in all cases.

        Args:
            phase: The phase to run; its status, result and possibly
                assigned_expert are updated in place.
            plan: The plan the phase belongs to, used to look up dependencies
                and to build the workspace key.

        Returns:
            The agent's ``output_data``, or ``{"content": ""}`` when it is empty.

        Raises:
            RuntimeError: If no active expert can run the phase, or the agent
                still reports a non-completed status on the last attempt.
            Exception: Any exception raised on the last attempt is re-raised.
                The phase status is left as RUNNING in that case; ``execute``
                marks the phase FAILED when it sees the exception.
        """
        # Resolve the assigned expert
        expert = self._team.get_expert(phase.assigned_expert)
        if not expert or not expert.is_active:
            expert = self._team.lead_expert
            if not expert or not expert.is_active:
                active = self._team.active_experts
                if not active:
                    raise RuntimeError(
                        f"Expert '{phase.assigned_expert}' not available and no active fallback"
                    )
                expert = active[0]
            logger.warning(
                f"Expert '{phase.assigned_expert}' not available, "
                f"falling back to '{expert.config.name}'"
            )
            # Record who actually runs the phase so later events are accurate
            phase.assigned_expert = expert.config.name

        # Update phase status
        phase.status = PhaseStatus.RUNNING

        # Emit phase_started event
        await self._broadcast_event(
            "phase_started",
            {
                "phase_id": phase.id,
                "phase_name": phase.name,
                "assigned_expert": phase.assigned_expert,
                "depends_on": list(phase.depends_on),
            },
        )

        # Read dependency outputs from in-memory phase results (faster than workspace)
        # Only dependencies that completed with a non-empty result contribute.
        dependency_outputs: dict[str, Any] = {}
        for dep_id in phase.depends_on:
            dep_phase = plan.get_phase(dep_id)
            if dep_phase and dep_phase.status == PhaseStatus.COMPLETED and dep_phase.result:
                dependency_outputs[dep_phase.name] = dep_phase.result.get(
                    "content", str(dep_phase.result)
                )

        # Emit expert_step event
        await self._broadcast_event(
            "expert_step",
            {
                "expert_id": expert.config.name,
                "expert_name": expert.config.name,
                "expert_color": expert.config.color,
                "content": phase.task_description,
                "step": phase.id,
                "phase_id": phase.id,
                "phase_name": phase.name,
            },
        )

        # Build TaskMessage for execution with context isolation
        # Input includes: task description, phase identity and dependency outputs
        input_data: dict[str, Any] = {
            "task": phase.task_description,
            "team_id": self._team.team_id,
            "phase_id": phase.id,
            "phase_name": phase.name,
            "is_phase": True,
            "dependency_outputs": dependency_outputs,
        }
        if dependency_outputs:
            # Human-readable digest of upstream outputs; each one is cut to its
            # first 500 characters (the untruncated values stay in
            # "dependency_outputs").
            input_data["context"] = (
                "前置阶段输出:\n"
                + "\n---\n".join(
                    f"[{name}]:\n{output[:500] if isinstance(output, str) else str(output)[:500]}"
                    for name, output in dependency_outputs.items()
                )
            )

        task_msg = TaskMessage(
            task_id=phase.id,
            agent_name=expert.config.name,
            task_type="team_phase",
            priority=0,
            input_data=input_data,
            callback_url=None,
            created_at=datetime.now(timezone.utc),
        )

        # Execute with context isolation: try creating independent agent via pool
        agent = await self._get_isolated_agent(expert, phase)
        last_error: str | None = None
        result: dict[str, Any] | None = None

        try:
            # One initial attempt plus MAX_RETRIES retries, all on the same agent
            for attempt in range(self.MAX_RETRIES + 1):
                try:
                    task_result: TaskResult = await agent.execute(task_msg)

                    if task_result.status != TaskStatus.COMPLETED.value:
                        last_error = task_result.error_message or "unknown error"
                        if attempt < self.MAX_RETRIES:
                            logger.info(f"Retrying phase {phase.id} (attempt {attempt + 1})")
                            continue
                        # Last attempt: surface the failure (re-raised by the
                        # handler below because no retries remain)
                        raise RuntimeError(f"Agent execution failed: {last_error}")

                    result = task_result.output_data or {"content": ""}

                    # Update phase status
                    phase.status = PhaseStatus.COMPLETED
                    phase.result = result

                    # Write phase output to SharedWorkspace
                    output_key = f"{plan.id}/phase/{phase.id}/output"
                    await self._team.workspace.write(
                        output_key,
                        result.get("content", str(result)),
                        expert.config.name,
                    )

                    # Emit expert_result event
                    await self._broadcast_event(
                        "expert_result",
                        {
                            "expert_id": expert.config.name,
                            "expert_name": expert.config.name,
                            "expert_color": expert.config.color,
                            "content": result.get("content", str(result)),
                            "phase_id": phase.id,
                        },
                    )

                    # Emit phase_completed event (summary capped at 200 chars)
                    result_summary = result.get("content", str(result))
                    if isinstance(result_summary, str) and len(result_summary) > 200:
                        result_summary = result_summary[:200] + "..."
                    await self._broadcast_event(
                        "phase_completed",
                        {
                            "phase_id": phase.id,
                            "phase_name": phase.name,
                            "result_summary": result_summary,
                        },
                    )

                    return result

                except Exception as e:
                    # Covers agent errors and anything raised after the agent
                    # returned (e.g. the workspace write); retried the same way
                    last_error = str(e)
                    if attempt < self.MAX_RETRIES:
                        logger.info(f"Retrying phase {phase.id} (attempt {attempt + 1})")
                        continue
                    raise

        finally:
            # Clean up isolated agent if we created one
            await self._cleanup_isolated_agent(phase)

        # Should not reach here: every loop iteration returns, raises, or
        # retries, and the last attempt cannot retry. This tail only runs if
        # the loop executed zero times (MAX_RETRIES set below 0).
        phase.status = PhaseStatus.FAILED
        # Emit phase_failed event
        await self._broadcast_event(
            "phase_failed",
            {
                "phase_id": phase.id,
                "phase_name": phase.name,
                "error": last_error or "unknown error",
            },
        )
        raise RuntimeError(f"Phase {phase.id} ({phase.name}) failed: {last_error}")

    async def _get_isolated_agent(self, expert: Expert, phase: PlanPhase) -> ConfigDrivenAgent:
        """Get an isolated ConfigDrivenAgent instance for the phase.

        If AgentPool is available, creates a temporary agent with a unique name
        for context isolation (KTD3). Otherwise, falls back to the expert's
        existing agent.
        """
        pool = self._team.pool
        if pool is None:
            # No pool available (e.g., in tests), use expert's existing agent
            return expert.agent

        # Create a temporary config with unique name for this phase
        temp_config = copy.deepcopy(expert.config)
        temp_config.name = f"{expert.config.name}__phase_{phase.id[:8]}"

        try:
            agent = await pool.create_agent(temp_config)
            # Track for cleanup
            self._temp_agents[phase.id] = temp_config.name
            return agent
        except Exception as e:
            logger.warning(
                f"Failed to create isolated agent for phase {phase.id}, "
                f"using expert's existing agent: {e}"
            )
            return expert.agent

    async def _cleanup_isolated_agent(self, phase: PlanPhase) -> None:
        """Clean up the temporary isolated agent if one was created."""
        pool = self._team.pool
        if pool is None:
            return

        temp_name = self._temp_agents.pop(phase.id, None)
        if temp_name:
            try:
                await pool.remove_agent(temp_name)
            except Exception as e:
                logger.warning(f"Failed to clean up isolated agent '{temp_name}': {e}")

    async def _mark_dependents_failed(
        self, failed_phase_id: str, plan: TeamPlan, phase_results: dict[str, dict[str, Any]]
    ) -> None:
        """Mark all phases that depend on the failed phase as FAILED."""
        for ph in plan.phases:
            if ph.status != PhaseStatus.PENDING:
                continue
            if failed_phase_id in ph.depends_on:
                ph.status = PhaseStatus.FAILED
                ph.result = {"error": f"Dependency phase '{failed_phase_id}' failed"}
                phase_results[ph.id] = {"error": f"Dependency '{failed_phase_id}' failed"}
                # Emit phase_failed event for cascaded failure
                await self._broadcast_event(
                    "phase_failed",
                    {
                        "phase_id": ph.id,
                        "phase_name": ph.name,
                        "error": f"Dependency phase '{failed_phase_id}' failed",
                    },
                )
                # Recursively mark their dependents
                await self._mark_dependents_failed(ph.id, plan, phase_results)

    async def _synthesize_results(
        self, lead: Expert, task: str, completed_phases: list[PlanPhase]
    ) -> dict[str, Any]:
        """Lead Expert synthesizes results using BEST strategy.

        The Lead Expert evaluates all completed phase results and produces
        a final synthesized result. Uses LLM when available, otherwise
        concatenates results.
        """
        results = [ph.result or {} for ph in completed_phases]
        if not results:
            return {"content": ""}

        # If only one result, return it directly
        if len(results) == 1:
            content = results[0].get("content", str(results[0]))
            return {
                "content": content,
                "strategy": "best",
                "phases_completed": 1,
            }

        gateway = self._get_llm_gateway(lead)
        if not gateway:
            # Without LLM, concatenate all results
            combined = "\n\n".join(
                r.get("content", str(r)) if isinstance(r, dict) else str(r) for r in results
            )
            return {
                "content": combined,
                "strategy": "best",
                "phases_completed": len(results),
            }

        # Build result summaries for LLM evaluation
        summaries = []
        for i, ph in enumerate(completed_phases):
            r = ph.result or {}
            content = r.get("content", str(r)) if isinstance(r, dict) else str(r)
            summaries.append(
                f"Phase {i + 1}: {ph.name} (by {ph.assigned_expert}, task: {ph.task_description[:100]}):\n"
                f"{content[:500]}"
            )

        prompt = (
            f"Original task: {task}\n\n"
            f"Below are {len(results)} phase results from your team members. "
            f"Synthesize them into a single comprehensive final result that "
            f"best addresses the original task.\n\n"
            + "\n---\n".join(summaries)
            + "\n\nProvide the synthesized result directly."
        )

        try:
            response = await gateway.chat(
                messages=[{"role": "user", "content": prompt}],
                model=self._get_model(lead),
            )
            return {
                "content": response.content.strip(),
                "strategy": "best",
                "phases_completed": len(results),
            }
        except Exception as e:
            logger.warning(f"LLM synthesis failed, falling back to concatenation: {e}")
            combined = "\n\n".join(
                r.get("content", str(r)) if isinstance(r, dict) else str(r) for r in results
            )
            return {
                "content": combined,
                "strategy": "best",
                "phases_completed": len(results),
            }

    async def _fallback_to_single_agent(
        self,
        task: str,
        plan: TeamPlan,
        phase_results: dict[str, dict[str, Any]],
    ) -> dict[str, Any]:
        """Fallback to single agent mode when pipeline execution fails.

        Uses the lead expert (or first active expert) to complete the original task.
        """
        plan.status = PlanStatus.FALLBACK
        logger.warning("Falling back to single agent mode")

        expert = self._team.lead_expert
        if not expert or not expert.is_active:
            active = self._team.active_experts
            expert = active[0] if active else None

        fallback_result: dict[str, Any] | None = None
        if expert:
            try:
                task_msg = TaskMessage(
                    task_id=f"fallback_{plan.id}",
                    agent_name=expert.config.name,
                    task_type="fallback",
                    priority=0,
                    input_data={
                        "task": task,
                        "phase_results": phase_results,
                        "team_id": self._team.team_id,
                    },
                    callback_url=None,
                    created_at=datetime.now(timezone.utc),
                )
                task_result: TaskResult = await expert.agent.execute(task_msg)
                fallback_result = task_result.output_data or {
                    "content": f"Task completed by {expert.config.name} (fallback mode)"
                }
            except Exception as e:
                logger.error(f"Fallback agent execution failed: {e}")
                fallback_result = {"error": f"Fallback execution failed: {e}"}
        else:
            fallback_result = {"error": "No active expert available for fallback"}

        return {
            "status": "fallback",
            "result": fallback_result,
            "phase_results": phase_results,
            "plan": plan,
        }

    def _get_model(self, expert: Expert | None = None) -> str:
        """Get LLM model name from expert config.

        Reads expert.config.llm (dict[str, Any] | None) and returns the model
        name. Falls back to "default" if not configured.

        V4 verified: ExpertConfig.llm is dict[str, Any] | None.
        """
        target = expert or self._team.lead_expert
        if target and target.config.llm:
            return target.config.llm.get("model", "default")
        return "default"

    def _get_llm_gateway(self, expert: Expert | None = None) -> LLMGateway | None:
        """Get LLM gateway from the given expert or the lead expert's agent.

        Falls back to other active experts if the primary target has no gateway.
        """
        target = expert or self._team.lead_expert
        if target and hasattr(target, "agent") and hasattr(target.agent, "_llm_gateway"):
            gateway = target.agent._llm_gateway
            if gateway is not None:
                return gateway
        # Fallback: try first active expert with a gateway
        for exp in self._team.active_experts:
            if hasattr(exp, "agent") and hasattr(exp.agent, "_llm_gateway"):
                gateway = exp.agent._llm_gateway
                if gateway is not None:
                    return gateway
        return None

    async def _broadcast_event(self, event_type: str, data: dict[str, Any]) -> None:
        """Broadcast an orchestration event to the team channel.

        Events are emitted via handoff_transport for WebSocket relay.
        Supported event types: team_formed, expert_step, expert_result,
        plan_update, phase_started, phase_completed, phase_failed,
        team_synthesis, team_dissolved.
        """
        if self._team.handoff_transport:
            try:
                await self._team.handoff_transport.send(
                    self._team.team_channel, {"type": event_type, **data}
                )
            except Exception as e:
                logger.warning(f"Failed to broadcast event '{event_type}': {e}")