fischer-agentkit/src/agentkit/experts/orchestrator.py

758 lines
30 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""TeamOrchestrator - 流水线模式专家团队执行引擎
驱动 ExpertTeam 在流水线模式下执行任务:
1. Lead Expert 接收任务分解为阶段PlanPhase阶段间有依赖关系depends_on
2. 按依赖拓扑排序同层无依赖阶段并行asyncio.gather层间串行
3. 每个阶段创建独立 ConfigDrivenAgent 实例上下文隔离KTD3
4. 阶段间数据通过 SharedWorkspace 传递({task_id}/phase/{phase_id}/output
5. Lead Expert 汇总所有阶段结果BEST 策略)
6. 返回最终结果
生命周期FORMING → PLANNING → EXECUTING → SYNTHESIZING → COMPLETED
设计依据:
- KTD2: Lead 分解为阶段而非子任务,支持流水线串行阶段
- KTD3: 上下文隔离,独立 ConfigDrivenAgent 实例
- KTD6: PLANNING 状态在分解阶段设置
"""
from __future__ import annotations
import asyncio
import copy
import json
import logging
import re
from datetime import datetime, timezone
from typing import Any
from agentkit.core.config_driven import ConfigDrivenAgent
from agentkit.core.protocol import TaskMessage, TaskResult, TaskStatus
from agentkit.llm.gateway import LLMGateway
from .expert import Expert
from .plan import PhaseStatus, PlanPhase, PlanStatus, TeamPlan
from .team import ExpertTeam, TeamStatus
logger = logging.getLogger(__name__)
class TeamOrchestrator:
    """Pipeline orchestration engine for an ExpertTeam.

    The Lead Expert decomposes the task into phases with dependencies
    (``depends_on``). Phases are executed in topological order: phases in the
    same layer run in parallel (``asyncio.gather``), layers run sequentially.
    Each phase gets an independent ConfigDrivenAgent instance for context
    isolation (KTD3) whenever the team exposes an agent pool.
    """

    MAX_PHASES = 10  # Maximum phases Lead Expert can decompose
    MAX_RETRIES = 1  # Retry once on phase failure before marking failed

    def __init__(self, team: ExpertTeam) -> None:
        """Bind the orchestrator to ``team``; no work is started here."""
        self._team = team
        # Track temporary agent names created for context isolation (KTD3).
        # Maps phase_id -> temp_agent_name so they can be removed afterwards.
        self._temp_agents: dict[str, str] = {}

    async def execute(self, task: str) -> dict[str, Any]:
        """Execute ``task`` in pipeline mode.

        Flow:
            1. Emit ``team_formed``.
            2. Set PLANNING, let the lead decompose the task into phases.
            3. Emit ``plan_update`` with the phase list.
            4. Set EXECUTING, topologically sort, then run layer by layer
               (phases inside a layer run concurrently).
            5. Set SYNTHESIZING and let the lead synthesize the results.
            6. Set COMPLETED, emit ``team_synthesis`` and ``team_dissolved``.

        Returns:
            A dict with ``"status"`` (``"completed"``, ``"failed"`` or
            ``"fallback"``), ``"result"``, ``"phase_results"`` (phase id ->
            result or ``{"error": ...}``) and ``"plan"``. When no expert is
            active at all, the dict has ``"status": "failed"`` and an
            ``"error"`` message and carries no ``"plan"`` key.
        """
        lead = self._team.lead_expert
        if not lead or not lead.is_active:
            active = self._team.active_experts
            if not active:
                return {
                    "status": "failed",
                    "result": None,
                    "phase_results": {},
                    "error": "No active expert available",
                }
            lead = active[0]
            logger.warning(f"Lead expert not available, falling back to '{lead.config.name}'")

        plan = TeamPlan(
            task=task,
            lead_expert=lead.config.name,
            status=PlanStatus.EXECUTING,
        )

        # 1. Emit team_formed event.
        # Experts are sent as IExpertInfo-compatible dicts plus an empty
        # plan_phases list to match the frontend contract.
        await self._broadcast_event(
            "team_formed",
            {
                "team_id": self._team.team_id,
                "status": self._team.status.value,
                "lead_expert": lead.config.name,
                "experts": [
                    {
                        "id": e.config.name,
                        "name": e.config.name,
                        "persona": e.config.persona,
                        "avatar": e.config.avatar,
                        "color": e.config.color,
                        "is_lead": e.config.name == lead.config.name,
                        "bound_skills": list(e.config.bound_skills),
                        "status": "active",
                    }
                    for e in self._team.active_experts
                ],
                "plan_phases": [],
            },
        )

        # 2. Set PLANNING status, lead decomposes the task into phases.
        self._team.set_status(TeamStatus.PLANNING)
        phases = await self._decompose_task(lead, task)
        if not phases:
            logger.warning("Task decomposition returned no phases, executing as single phase")
            phases = [PlanPhase(name="执行", assigned_expert=lead.config.name, task_description=task)]

        kept = phases[: self.MAX_PHASES]
        if len(phases) > len(kept):
            # A kept phase may depend on a phase that was just cut off; such a
            # dangling id would make topological_sort reject the whole plan.
            pruned = self._drop_dangling_dependencies(kept)
            logger.warning(
                f"Plan truncated from {len(phases)} to {len(kept)} phases, "
                f"dropped {pruned} dependency reference(s) to removed phases"
            )
        plan.phases = kept

        # 3. Emit plan_update with the phase list.
        await self._broadcast_event(
            "plan_update",
            {
                "plan_id": plan.id,
                "plan_phases": [ph.to_dict() for ph in plan.phases],
            },
        )

        # 4. Set EXECUTING status, execute phases.
        self._team.set_status(TeamStatus.EXECUTING)
        phase_results: dict[str, dict[str, Any]] = {}
        try:
            # Topological sort phases into execution layers.
            layers = plan.topological_sort()

            # Layers run sequentially, phases within a layer in parallel.
            for layer in layers:
                # Skip phases already failed through a failed dependency.
                ready = [ph for ph in layer if ph.status == PhaseStatus.PENDING]
                if not ready:
                    continue

                results = await asyncio.gather(
                    *[self._execute_phase(ph, plan) for ph in ready],
                    return_exceptions=True,
                )
                for ph, result in zip(ready, results):
                    if isinstance(result, (Exception, asyncio.CancelledError)):
                        logger.error(f"Phase {ph.id} ({ph.name}) failed: {result}")
                        plan.update_phase_status(
                            ph.id, PhaseStatus.FAILED, {"error": str(result)}
                        )
                        phase_results[ph.id] = {"error": str(result)}
                        await self._broadcast_event(
                            "phase_failed",
                            {
                                "phase_id": ph.id,
                                "phase_name": ph.name,
                                "error": str(result),
                            },
                        )
                        # Everything downstream of this phase cannot run.
                        await self._mark_dependents_failed(ph.id, plan, phase_results)
                    else:
                        phase_results[ph.id] = result

            # 5. If nothing completed, hand the task to a single agent.
            completed = plan.completed_phases
            if not completed:
                logger.warning("All phases failed, falling back to single agent")
                return await self._fallback_to_single_agent(task, plan, phase_results)

            # 6. Lead Expert synthesizes results (BEST strategy).
            self._team.set_status(TeamStatus.SYNTHESIZING)
            plan.status = PlanStatus.COMPLETED
            final_result = await self._synthesize_results(lead, task, completed)
            self._team.set_status(TeamStatus.COMPLETED)

            # 7. Emit team_synthesis event.
            await self._broadcast_event(
                "team_synthesis",
                {
                    "content": final_result.get("content", ""),
                    "phases_completed": len(completed),
                    "phases_total": len(plan.phases),
                },
            )
            # 8. Emit team_dissolved event.
            await self._broadcast_event(
                "team_dissolved",
                {"team_id": self._team.team_id},
            )
            return {
                "status": "completed",
                "result": final_result,
                "phase_results": phase_results,
                "plan": plan,
            }
        except ValueError as e:
            # Circular dependency or invalid reference from topological_sort.
            logger.error(f"Pipeline execution failed (invalid plan): {e}")
            plan.status = PlanStatus.FAILED
            await self._broadcast_event(
                "team_dissolved", {"team_id": self._team.team_id}
            )
            return await self._fallback_to_single_agent(task, plan, phase_results)
        except Exception as e:
            logger.error(f"Pipeline execution failed: {e}")
            plan.status = PlanStatus.FAILED
            await self._broadcast_event(
                "team_dissolved", {"team_id": self._team.team_id}
            )
            return await self._fallback_to_single_agent(task, plan, phase_results)

    @staticmethod
    def _drop_dangling_dependencies(phases: list[PlanPhase]) -> int:
        """Remove ``depends_on`` ids that do not belong to any phase in ``phases``.

        The ``depends_on`` lists are edited in place.

        Returns:
            The number of dependency references that were removed.
        """
        known_ids = {ph.id for ph in phases}
        removed = 0
        for ph in phases:
            valid = [dep_id for dep_id in ph.depends_on if dep_id in known_ids]
            removed += len(ph.depends_on) - len(valid)
            ph.depends_on[:] = valid
        return removed

    async def _decompose_task(self, lead: Expert, task: str) -> list[PlanPhase]:
        """Ask the lead expert's LLM to decompose ``task`` into phases.

        Returns:
            The parsed phases. When no gateway is available, the LLM call
            fails, or the response yields no valid phase, a single phase
            carrying the original task and assigned to the lead is returned.
        """
        gateway = self._get_llm_gateway(lead)
        if not gateway:
            logger.warning("No LLM gateway available, treating task as single phase")
            return [PlanPhase(name="执行", assigned_expert=lead.config.name, task_description=task)]

        # Prefer delegating to non-lead members; the lead is offered only
        # when it is the sole active expert.
        member_names = [
            e.config.name for e in self._team.active_experts if e.config.name != lead.config.name
        ]
        available_experts = member_names if member_names else [lead.config.name]
        expert_list = ", ".join(available_experts)
        prompt = (
            f"You are the Lead Expert in a pipeline team. Decompose the following task into "
            f"at most {self.MAX_PHASES} phases with dependencies.\n\n"
            f"Task: {task}\n\n"
            f"Available experts: {expert_list}\n\n"
            f"Return a JSON array of phase objects, each with:\n"
            f'- "name": phase name (e.g., "规划", "前端", "后端", "QA", "评审")\n'
            f'- "assigned_expert": name of the expert to assign '
            f"(must be one of: {expert_list})\n"
            f'- "task_description": clear phase task description\n'
            f'- "depends_on": array of phase names this phase depends on (empty array if none)\n\n'
            f"Example:\n"
            f'[{{"name":"规划","assigned_expert":"tech_lead",'
            f'"task_description":"设计架构","depends_on":[]}},'
            f'{{"name":"前端","assigned_expert":"frontend",'
            f'"task_description":"实现UI","depends_on":["规划"]}}]\n\n'
            f"Return ONLY the JSON array, no other text."
        )
        try:
            response = await gateway.chat(
                messages=[{"role": "user", "content": prompt}],
                model=self._get_model(lead),
            )
            phases = self._parse_phases(response.content, available_experts, lead.config.name)
            if phases:
                return phases
            logger.warning("LLM decomposition returned no valid phases")
        except Exception as e:
            logger.warning(f"LLM task decomposition failed: {e}")
        return [PlanPhase(name="执行", assigned_expert=lead.config.name, task_description=task)]

    @staticmethod
    def _parse_phases(
        content: str, available_experts: list[str], lead_name: str
    ) -> list[PlanPhase]:
        """Parse an LLM response into a list of PlanPhase objects.

        The span from the first ``[`` to the last ``]`` is decoded as JSON.
        Each usable object becomes a phase; ``depends_on`` phase names are
        resolved to phase ids in a second pass once every phase exists.

        Malformed input never raises: non-dict items and items whose ``name``
        is missing, empty or not a string are skipped; an ``assigned_expert``
        outside ``available_experts`` is replaced by ``lead_name``; a missing
        or non-string ``task_description`` falls back to the phase name;
        unknown dependency names are logged and ignored; self-dependencies
        and repeated dependencies are dropped.

        Returns:
            The parsed phases, or ``[]`` when nothing usable was found.
        """
        if not isinstance(content, str):
            return []
        json_match = re.search(r"\[.*\]", content, re.DOTALL)
        if not json_match:
            return []
        try:
            items = json.loads(json_match.group(0))
        except json.JSONDecodeError:
            return []
        if not isinstance(items, list):
            return []

        def _text(value: Any) -> str:
            # LLM output is untrusted: null / numbers / objects count as empty.
            return value.strip() if isinstance(value, str) else ""

        # First pass: create phases with ids and build the name -> id mapping.
        name_to_id: dict[str, str] = {}
        pending: list[tuple[PlanPhase, list[Any]]] = []
        for item in items:
            if not isinstance(item, dict):
                continue
            name = _text(item.get("name"))
            if not name:
                continue
            assigned = _text(item.get("assigned_expert"))
            if assigned not in available_experts:
                assigned = lead_name
            task_desc = _text(item.get("task_description")) or name
            depends_on_names = item.get("depends_on", [])
            if not isinstance(depends_on_names, list):
                depends_on_names = []
            phase = PlanPhase(
                name=name,
                assigned_expert=assigned,
                task_description=task_desc,
                depends_on=[],  # Resolved to ids in the second pass
            )
            pending.append((phase, depends_on_names))
            name_to_id[name] = phase.id

        # Second pass: resolve depends_on from names to ids.
        phases: list[PlanPhase] = []
        for phase, depends_on_names in pending:
            for dep_name in depends_on_names:
                dep_key = _text(dep_name)
                dep_id = name_to_id.get(dep_key) if dep_key else None
                if not dep_id:
                    logger.warning(
                        f"Phase '{phase.name}' depends on unknown phase '{dep_name}', ignoring"
                    )
                    continue
                # A self-reference would form a cycle; a repeat adds nothing.
                if dep_id == phase.id or dep_id in phase.depends_on:
                    continue
                phase.depends_on.append(dep_id)
            phases.append(phase)
        return phases

    async def _execute_phase(self, phase: PlanPhase, plan: TeamPlan) -> dict[str, Any]:
        """Execute a single phase with its assigned expert.

        Resolves the expert (falling back to the lead, then to the first
        active expert), gathers the results of completed dependency phases,
        runs the task on an isolated agent when possible (KTD3), stores the
        output in the team workspace and emits the phase events.

        Returns:
            The agent's ``output_data`` (or ``{"content": ""}`` when empty).

        Raises:
            RuntimeError: No expert is available, or the agent did not report
                completion after the final attempt.
            Exception: Whatever the final attempt raised.
        """
        # Resolve the assigned expert.
        expert = self._team.get_expert(phase.assigned_expert)
        if not expert or not expert.is_active:
            expert = self._team.lead_expert
            if not expert or not expert.is_active:
                active = self._team.active_experts
                if not active:
                    raise RuntimeError(
                        f"Expert '{phase.assigned_expert}' not available and no active fallback"
                    )
                expert = active[0]
            logger.warning(
                f"Expert '{phase.assigned_expert}' not available, "
                f"falling back to '{expert.config.name}'"
            )
            phase.assigned_expert = expert.config.name

        phase.status = PhaseStatus.RUNNING
        await self._broadcast_event(
            "phase_started",
            {
                "phase_id": phase.id,
                "phase_name": phase.name,
                "assigned_expert": phase.assigned_expert,
                "depends_on": list(phase.depends_on),
            },
        )

        # Read dependency outputs from the in-memory phase results rather
        # than going back to the workspace.
        dependency_outputs: dict[str, Any] = {}
        for dep_id in phase.depends_on:
            dep_phase = plan.get_phase(dep_id)
            if dep_phase and dep_phase.status == PhaseStatus.COMPLETED and dep_phase.result:
                dependency_outputs[dep_phase.name] = dep_phase.result.get(
                    "content", str(dep_phase.result)
                )

        await self._broadcast_event(
            "expert_step",
            {
                "expert_id": expert.config.name,
                "expert_name": expert.config.name,
                "expert_color": expert.config.color,
                "content": phase.task_description,
                "step": phase.id,
                "phase_id": phase.id,
                "phase_name": phase.name,
            },
        )

        # Build the TaskMessage: task description plus dependency outputs.
        input_data: dict[str, Any] = {
            "task": phase.task_description,
            "team_id": self._team.team_id,
            "phase_id": phase.id,
            "phase_name": phase.name,
            "is_phase": True,
            "dependency_outputs": dependency_outputs,
        }
        if dependency_outputs:
            # Human-readable digest; each upstream output is capped at 500 chars.
            input_data["context"] = (
                "前置阶段输出:\n"
                + "\n---\n".join(
                    f"[{name}]:\n{output[:500] if isinstance(output, str) else str(output)[:500]}"
                    for name, output in dependency_outputs.items()
                )
            )
        task_msg = TaskMessage(
            task_id=phase.id,
            agent_name=expert.config.name,
            task_type="team_phase",
            priority=0,
            input_data=input_data,
            callback_url=None,
            created_at=datetime.now(timezone.utc),
        )

        # Execute with context isolation: try an independent agent via the pool.
        agent = await self._get_isolated_agent(expert, phase)
        last_error: str | None = None
        result: dict[str, Any] | None = None
        try:
            for attempt in range(self.MAX_RETRIES + 1):
                try:
                    task_result: TaskResult = await agent.execute(task_msg)
                    if task_result.status != TaskStatus.COMPLETED.value:
                        last_error = task_result.error_message or "unknown error"
                        if attempt < self.MAX_RETRIES:
                            logger.info(f"Retrying phase {phase.id} (attempt {attempt + 1})")
                            continue
                        raise RuntimeError(f"Agent execution failed: {last_error}")

                    result = task_result.output_data or {"content": ""}
                    phase.status = PhaseStatus.COMPLETED
                    phase.result = result

                    # Write the phase output to the team workspace.
                    output_key = f"{plan.id}/phase/{phase.id}/output"
                    await self._team.workspace.write(
                        output_key,
                        result.get("content", str(result)),
                        expert.config.name,
                    )
                    await self._broadcast_event(
                        "expert_result",
                        {
                            "expert_id": expert.config.name,
                            "expert_name": expert.config.name,
                            "expert_color": expert.config.color,
                            "content": result.get("content", str(result)),
                            "phase_id": phase.id,
                        },
                    )
                    # phase_completed only carries a short preview.
                    result_summary = result.get("content", str(result))
                    if isinstance(result_summary, str) and len(result_summary) > 200:
                        result_summary = result_summary[:200] + "..."
                    await self._broadcast_event(
                        "phase_completed",
                        {
                            "phase_id": phase.id,
                            "phase_name": phase.name,
                            "result_summary": result_summary,
                        },
                    )
                    return result
                except Exception as e:
                    last_error = str(e)
                    if attempt < self.MAX_RETRIES:
                        logger.info(f"Retrying phase {phase.id} (attempt {attempt + 1})")
                        continue
                    raise
        finally:
            # Clean up the isolated agent if we created one.
            await self._cleanup_isolated_agent(phase)

        # Every loop iteration returns, retries or raises, so this point is
        # only reached when the loop body never ran (negative MAX_RETRIES).
        phase.status = PhaseStatus.FAILED
        await self._broadcast_event(
            "phase_failed",
            {
                "phase_id": phase.id,
                "phase_name": phase.name,
                "error": last_error or "unknown error",
            },
        )
        raise RuntimeError(f"Phase {phase.id} ({phase.name}) failed: {last_error}")

    async def _get_isolated_agent(self, expert: Expert, phase: PlanPhase) -> ConfigDrivenAgent:
        """Return the agent that should run ``phase``.

        With an agent pool, a temporary agent is created from a deep copy of
        the expert's config under a phase-specific name (KTD3) and recorded
        for later cleanup. Without a pool, or when creation fails, the
        expert's existing agent is returned instead.
        """
        pool = self._team.pool
        if pool is None:
            # No pool available (e.g., in tests), use expert's existing agent.
            return expert.agent

        # Copy the config so the rename does not leak into the expert itself.
        temp_config = copy.deepcopy(expert.config)
        temp_config.name = f"{expert.config.name}__phase_{phase.id[:8]}"
        try:
            agent = await pool.create_agent(temp_config)
            self._temp_agents[phase.id] = temp_config.name
            return agent
        except Exception as e:
            logger.warning(
                f"Failed to create isolated agent for phase {phase.id}, "
                f"using expert's existing agent: {e}"
            )
            return expert.agent

    async def _cleanup_isolated_agent(self, phase: PlanPhase) -> None:
        """Remove the temporary agent created for ``phase``, if there is one.

        Removal is best-effort: a failure is logged and not propagated.
        """
        pool = self._team.pool
        if pool is None:
            return
        temp_name = self._temp_agents.pop(phase.id, None)
        if temp_name:
            try:
                await pool.remove_agent(temp_name)
            except Exception as e:
                logger.warning(f"Failed to clean up isolated agent '{temp_name}': {e}")

    async def _mark_dependents_failed(
        self, failed_phase_id: str, plan: TeamPlan, phase_results: dict[str, dict[str, Any]]
    ) -> None:
        """Mark every still-pending phase downstream of a failed phase as FAILED.

        Each direct dependent gets an error result, an entry in
        ``phase_results`` and a ``phase_failed`` event; the method then
        recurses so transitive dependents are failed as well.
        """
        for ph in plan.phases:
            if ph.status != PhaseStatus.PENDING:
                continue
            if failed_phase_id in ph.depends_on:
                ph.status = PhaseStatus.FAILED
                ph.result = {"error": f"Dependency phase '{failed_phase_id}' failed"}
                phase_results[ph.id] = {"error": f"Dependency '{failed_phase_id}' failed"}
                await self._broadcast_event(
                    "phase_failed",
                    {
                        "phase_id": ph.id,
                        "phase_name": ph.name,
                        "error": f"Dependency phase '{failed_phase_id}' failed",
                    },
                )
                # Recursively mark their dependents.
                await self._mark_dependents_failed(ph.id, plan, phase_results)

    @staticmethod
    def _result_text(result: Any) -> str:
        """Return the textual content of a phase result.

        Dicts contribute their ``"content"`` value (or their ``str()`` when
        the key is absent); anything else is stringified. ``None`` content
        becomes an empty string so it can be sliced and joined safely.
        """
        content = result.get("content", str(result)) if isinstance(result, dict) else result
        if content is None:
            return ""
        return content if isinstance(content, str) else str(content)

    async def _synthesize_results(
        self, lead: Expert, task: str, completed_phases: list[PlanPhase]
    ) -> dict[str, Any]:
        """Combine the completed phase results into one final result.

        A single result is passed through unchanged. With several results the
        lead's LLM is asked to synthesize them (each phase contributes at most
        500 characters to the prompt); without a gateway, or when the LLM call
        fails, the results are concatenated with blank lines in between.

        Returns:
            ``{"content": ""}`` when there is nothing to combine, otherwise a
            dict with ``"content"``, ``"strategy"`` (always ``"best"``) and
            ``"phases_completed"``.
        """
        results = [ph.result or {} for ph in completed_phases]
        if not results:
            return {"content": ""}

        # A single result needs no synthesis; return it directly.
        if len(results) == 1:
            content = results[0].get("content", str(results[0]))
            return {
                "content": content,
                "strategy": "best",
                "phases_completed": 1,
            }

        gateway = self._get_llm_gateway(lead)
        if not gateway:
            # Without LLM, concatenate all results.
            return {
                "content": "\n\n".join(self._result_text(r) for r in results),
                "strategy": "best",
                "phases_completed": len(results),
            }

        # Build result summaries for LLM evaluation. Content is coerced to
        # text first because agents may return non-string content.
        summaries = []
        for index, ph in enumerate(completed_phases, start=1):
            content = self._result_text(ph.result or {})
            summaries.append(
                f"Phase {index}: {ph.name} (by {ph.assigned_expert}, task: {ph.task_description[:100]}):\n"
                f"{content[:500]}"
            )
        prompt = (
            f"Original task: {task}\n\n"
            f"Below are {len(results)} phase results from your team members. "
            f"Synthesize them into a single comprehensive final result that "
            f"best addresses the original task.\n\n"
            + "\n---\n".join(summaries)
            + "\n\nProvide the synthesized result directly."
        )
        try:
            response = await gateway.chat(
                messages=[{"role": "user", "content": prompt}],
                model=self._get_model(lead),
            )
            return {
                "content": response.content.strip(),
                "strategy": "best",
                "phases_completed": len(results),
            }
        except Exception as e:
            logger.warning(f"LLM synthesis failed, falling back to concatenation: {e}")
            return {
                "content": "\n\n".join(self._result_text(r) for r in results),
                "strategy": "best",
                "phases_completed": len(results),
            }

    async def _fallback_to_single_agent(
        self,
        task: str,
        plan: TeamPlan,
        phase_results: dict[str, dict[str, Any]],
    ) -> dict[str, Any]:
        """Run the original task on one expert after the pipeline failed.

        Uses the lead expert, or the first active expert when the lead is
        missing or inactive. Execution errors are captured in the result
        instead of being raised.

        Returns:
            A dict with ``"status": "fallback"``, the fallback ``"result"``
            (or an ``{"error": ...}`` dict), ``"phase_results"`` and ``"plan"``.
        """
        plan.status = PlanStatus.FALLBACK
        logger.warning("Falling back to single agent mode")

        expert = self._team.lead_expert
        if not expert or not expert.is_active:
            active = self._team.active_experts
            expert = active[0] if active else None

        fallback_result: dict[str, Any] | None = None
        if expert:
            try:
                task_msg = TaskMessage(
                    task_id=f"fallback_{plan.id}",
                    agent_name=expert.config.name,
                    task_type="fallback",
                    priority=0,
                    input_data={
                        "task": task,
                        "phase_results": phase_results,
                        "team_id": self._team.team_id,
                    },
                    callback_url=None,
                    created_at=datetime.now(timezone.utc),
                )
                task_result: TaskResult = await expert.agent.execute(task_msg)
                fallback_result = task_result.output_data or {
                    "content": f"Task completed by {expert.config.name} (fallback mode)"
                }
            except Exception as e:
                logger.error(f"Fallback agent execution failed: {e}")
                fallback_result = {"error": f"Fallback execution failed: {e}"}
        else:
            fallback_result = {"error": "No active expert available for fallback"}

        return {
            "status": "fallback",
            "result": fallback_result,
            "phase_results": phase_results,
            "plan": plan,
        }

    def _get_model(self, expert: Expert | None = None) -> str:
        """Return the LLM model name configured for ``expert``.

        Reads the ``"model"`` entry of ``expert.config.llm`` (the lead
        expert's config when ``expert`` is None). Returns ``"default"`` when
        there is no expert, no ``llm`` config, or the entry is missing or empty.
        """
        target = expert or self._team.lead_expert
        if target and target.config.llm:
            # `or` also covers an explicit null/empty "model" entry.
            return target.config.llm.get("model") or "default"
        return "default"

    @staticmethod
    def _gateway_of(expert: Any) -> LLMGateway | None:
        """Return ``expert.agent._llm_gateway``, or None if any link is missing."""
        return getattr(getattr(expert, "agent", None), "_llm_gateway", None)

    def _get_llm_gateway(self, expert: Expert | None = None) -> LLMGateway | None:
        """Return an LLM gateway for ``expert`` (the lead expert when None).

        Falls back to the first active expert whose agent has a gateway, and
        returns None when no gateway can be found.
        """
        target = expert or self._team.lead_expert
        if target:
            gateway = self._gateway_of(target)
            if gateway is not None:
                return gateway
        # Fallback: try first active expert with a gateway.
        for exp in self._team.active_experts:
            gateway = self._gateway_of(exp)
            if gateway is not None:
                return gateway
        return None

    async def _broadcast_event(self, event_type: str, data: dict[str, Any]) -> None:
        """Send an orchestration event to the team channel.

        The payload is ``data`` with ``"type": event_type`` added and goes out
        through the team's handoff transport. Nothing is sent when the team
        has no transport, and send failures are logged rather than raised so
        event delivery cannot break the pipeline.

        Event types used by this class: team_formed, expert_step,
        expert_result, plan_update, phase_started, phase_completed,
        phase_failed, team_synthesis, team_dissolved.
        """
        if self._team.handoff_transport:
            try:
                await self._team.handoff_transport.send(
                    self._team.team_channel, {"type": event_type, **data}
                )
            except Exception as e:
                logger.warning(f"Failed to broadcast event '{event_type}': {e}")