546 lines
18 KiB
Python
546 lines
18 KiB
Python
"""Pipeline执行引擎 - 编排多阶段Agent任务链的执行"""
|
||
|
||
import asyncio
|
||
import logging
|
||
import time
|
||
import uuid
|
||
from datetime import datetime, timezone
|
||
from typing import Any, Optional
|
||
|
||
from .loader import PipelineLoader
|
||
from .schema import (
|
||
Pipeline,
|
||
PipelineResult,
|
||
PipelineStage,
|
||
StageResult,
|
||
StageStatus,
|
||
)
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
class PipelineExecutionError(Exception):
|
||
"""Pipeline执行错误"""
|
||
|
||
def __init__(self, pipeline_name: str, reason: str = ""):
|
||
self.pipeline_name = pipeline_name
|
||
self.reason = reason
|
||
super().__init__(f"Pipeline '{pipeline_name}' execution error: {reason}")
|
||
|
||
|
||
class PipelineEngine:
|
||
"""
|
||
Pipeline执行引擎。
|
||
|
||
负责按照YAML定义的依赖关系(DAG)编排多阶段Agent任务的执行,
|
||
支持变量传递、超时控制、重试、条件执行等。
|
||
|
||
如果dispatcher为None,则使用模拟执行(dry-run模式),
|
||
用于测试和调试Pipeline定义。
|
||
"""
|
||
|
||
def __init__(self, dispatcher=None):
|
||
"""
|
||
Args:
|
||
dispatcher: TaskDispatcher实例(可选)。
|
||
如果为None,使用模拟执行(dry-run模式)。
|
||
"""
|
||
self.dispatcher = dispatcher
|
||
self.loader = PipelineLoader()
|
||
|
||
async def execute(
|
||
self,
|
||
pipeline: Pipeline,
|
||
context: dict | None = None,
|
||
) -> PipelineResult:
|
||
"""
|
||
执行整个Pipeline。
|
||
|
||
执行流程:
|
||
1. 合并全局变量和传入context
|
||
2. 拓扑排序确定执行顺序
|
||
3. 按依赖顺序逐stage执行
|
||
4. 变量解析:将${...}替换为实际值
|
||
5. 调用dispatcher分发任务给对应agent
|
||
6. 收集结果,传递给下游stage
|
||
|
||
Args:
|
||
pipeline: Pipeline定义对象
|
||
context: 外部传入的上下文变量(覆盖pipeline.variables中的默认值)
|
||
|
||
Returns:
|
||
PipelineResult包含所有阶段的执行结果
|
||
"""
|
||
start_time = time.monotonic()
|
||
pipeline_name = pipeline.name
|
||
|
||
# 构建执行上下文:合并全局变量和外部context
|
||
exec_context: dict[str, Any] = dict(pipeline.variables)
|
||
if context:
|
||
exec_context.update(context)
|
||
|
||
# 初始化stages结果存储
|
||
stages_results: dict[str, StageResult] = {}
|
||
|
||
# 构建stages上下文(用于变量解析)
|
||
stages_context: dict[str, Any] = {}
|
||
|
||
# 拓扑排序
|
||
try:
|
||
sorted_stages = self._topological_sort(pipeline.stages)
|
||
except ValueError as e:
|
||
duration_ms = int((time.monotonic() - start_time) * 1000)
|
||
return PipelineResult(
|
||
pipeline_name=pipeline_name,
|
||
status=StageStatus.FAILED,
|
||
stages_results=stages_results,
|
||
duration_ms=duration_ms,
|
||
error=str(e),
|
||
)
|
||
|
||
logger.info(
|
||
f"Pipeline '{pipeline_name}' starting execution "
|
||
f"({len(sorted_stages)} stages, order: {[s.name for s in sorted_stages]})"
|
||
)
|
||
|
||
# 逐阶段执行
|
||
overall_status = StageStatus.COMPLETED
|
||
failed_stages: set[str] = set()
|
||
skipped_stages: set[str] = set()
|
||
|
||
for stage in sorted_stages:
|
||
# 检查是否应该跳过此阶段
|
||
if self._should_skip(stage, failed_stages, skipped_stages):
|
||
skipped_stages.add(stage.name)
|
||
stages_results[stage.name] = StageResult(
|
||
stage_name=stage.name,
|
||
status=StageStatus.SKIPPED,
|
||
)
|
||
# 将跳过阶段的信息写入stages_context
|
||
stages_context[stage.name] = {"outputs": {}, "status": StageStatus.SKIPPED}
|
||
logger.info(f"Stage '{stage.name}' skipped (dependency failed)")
|
||
continue
|
||
|
||
# 检查条件表达式
|
||
if stage.condition and not self._evaluate_condition(stage.condition, exec_context, stages_context):
|
||
skipped_stages.add(stage.name)
|
||
stages_results[stage.name] = StageResult(
|
||
stage_name=stage.name,
|
||
status=StageStatus.SKIPPED,
|
||
)
|
||
stages_context[stage.name] = {"outputs": {}, "status": StageStatus.SKIPPED}
|
||
logger.info(f"Stage '{stage.name}' skipped (condition not met: {stage.condition})")
|
||
continue
|
||
|
||
# 执行阶段
|
||
stage_result = await self._execute_stage(stage, exec_context, stages_context)
|
||
stages_results[stage.name] = stage_result
|
||
|
||
# 将结果写入stages_context供下游引用
|
||
stages_context[stage.name] = {
|
||
"outputs": stage_result.outputs,
|
||
"status": stage_result.status,
|
||
}
|
||
|
||
if stage_result.status == StageStatus.FAILED:
|
||
if stage.continue_on_failure:
|
||
logger.warning(
|
||
f"Stage '{stage.name}' failed but continue_on_failure=True"
|
||
)
|
||
else:
|
||
failed_stages.add(stage.name)
|
||
overall_status = StageStatus.FAILED
|
||
logger.error(
|
||
f"Stage '{stage.name}' failed: {stage_result.error}"
|
||
)
|
||
|
||
logger.info(
|
||
f"Stage '{stage.name}' {stage_result.status.value} "
|
||
f"({stage_result.duration_ms}ms)"
|
||
)
|
||
|
||
duration_ms = int((time.monotonic() - start_time) * 1000)
|
||
|
||
result = PipelineResult(
|
||
pipeline_name=pipeline_name,
|
||
status=overall_status,
|
||
stages_results=stages_results,
|
||
duration_ms=duration_ms,
|
||
)
|
||
|
||
logger.info(
|
||
f"Pipeline '{pipeline_name}' finished with status={overall_status.value} "
|
||
f"({duration_ms}ms)"
|
||
)
|
||
|
||
return result
|
||
|
||
async def _execute_stage(
|
||
self,
|
||
stage: PipelineStage,
|
||
exec_context: dict[str, Any],
|
||
stages_context: dict[str, Any],
|
||
) -> StageResult:
|
||
"""
|
||
执行单个Stage。
|
||
|
||
1. 解析inputs中的变量引用
|
||
2. 通过dispatcher向agent发送任务
|
||
3. 等待结果(或超时)
|
||
4. 如果dispatcher为None,返回模拟结果(dry-run模式)
|
||
|
||
Args:
|
||
stage: 阶段定义
|
||
exec_context: 全局执行上下文
|
||
stages_context: 已完成阶段的结果上下文
|
||
|
||
Returns:
|
||
StageResult
|
||
"""
|
||
start_time = time.monotonic()
|
||
|
||
# 构建变量解析的完整上下文
|
||
resolve_ctx = dict(exec_context)
|
||
resolve_ctx["stages"] = stages_context
|
||
|
||
# 解析inputs中的变量引用
|
||
resolved_inputs = self._resolve_stage_inputs(stage.inputs, resolve_ctx)
|
||
|
||
logger.info(
|
||
f"Stage '{stage.name}' starting "
|
||
f"(agent={stage.agent}, action={stage.action})"
|
||
)
|
||
|
||
# 执行(带重试)
|
||
last_error: str | None = None
|
||
for attempt in range(stage.retry_count + 1):
|
||
if attempt > 0:
|
||
logger.info(
|
||
f"Stage '{stage.name}' retry attempt {attempt}/{stage.retry_count}"
|
||
)
|
||
|
||
try:
|
||
if self.dispatcher is not None:
|
||
# 实际执行:通过dispatcher分发任务
|
||
result = await self._dispatch_and_wait(stage, resolved_inputs)
|
||
else:
|
||
# Dry-run模式:返回模拟结果
|
||
result = self._dry_run_stage(stage, resolved_inputs)
|
||
|
||
# 成功
|
||
duration_ms = int((time.monotonic() - start_time) * 1000)
|
||
result.duration_ms = duration_ms
|
||
return result
|
||
|
||
except asyncio.TimeoutError:
|
||
last_error = f"Stage timed out after {stage.timeout_seconds}s"
|
||
logger.warning(
|
||
f"Stage '{stage.name}' timeout (attempt {attempt + 1}/{stage.retry_count + 1})"
|
||
)
|
||
|
||
except Exception as e:
|
||
last_error = str(e)
|
||
logger.warning(
|
||
f"Stage '{stage.name}' error (attempt {attempt + 1}/{stage.retry_count + 1}): {e}"
|
||
)
|
||
|
||
# 所有重试都失败
|
||
duration_ms = int((time.monotonic() - start_time) * 1000)
|
||
return StageResult(
|
||
stage_name=stage.name,
|
||
status=StageStatus.FAILED,
|
||
error=last_error,
|
||
duration_ms=duration_ms,
|
||
)
|
||
|
||
async def _dispatch_and_wait(
|
||
self,
|
||
stage: PipelineStage,
|
||
resolved_inputs: dict[str, Any],
|
||
) -> StageResult:
|
||
"""
|
||
通过TaskDispatcher分发任务并等待结果。
|
||
|
||
Args:
|
||
stage: 阶段定义
|
||
resolved_inputs: 已解析的输入参数
|
||
|
||
Returns:
|
||
StageResult
|
||
|
||
Raises:
|
||
asyncio.TimeoutError: 超时
|
||
Exception: 分发或执行错误
|
||
"""
|
||
from app.agent_framework.protocol import TaskMessage, TaskResult, TaskStatus
|
||
|
||
task_id = str(uuid.uuid4())
|
||
task_message = TaskMessage(
|
||
task_id=task_id,
|
||
agent_name=stage.agent,
|
||
task_type=stage.action,
|
||
priority=0,
|
||
input_data=resolved_inputs,
|
||
callback_url=None,
|
||
created_at=datetime.now(timezone.utc),
|
||
timeout_seconds=stage.timeout_seconds,
|
||
)
|
||
|
||
# 分发任务
|
||
dispatched_id = await self.dispatcher.dispatch(task_message)
|
||
|
||
# 等待结果(轮询方式)
|
||
elapsed = 0.0
|
||
poll_interval = 1.0 # 1秒轮询一次
|
||
while elapsed < stage.timeout_seconds:
|
||
await asyncio.sleep(poll_interval)
|
||
elapsed += poll_interval
|
||
|
||
task_status = await self.dispatcher.get_task_status(dispatched_id)
|
||
|
||
if task_status.get("status") == TaskStatus.COMPLETED:
|
||
output_data = task_status.get("output_data", {})
|
||
# 构建输出:只提取stage.outputs中声明的变量
|
||
outputs = self._extract_outputs(stage, output_data)
|
||
return StageResult(
|
||
stage_name=stage.name,
|
||
status=StageStatus.COMPLETED,
|
||
outputs=outputs,
|
||
)
|
||
|
||
elif task_status.get("status") == TaskStatus.FAILED:
|
||
error_msg = task_status.get("error_message", "Unknown error")
|
||
return StageResult(
|
||
stage_name=stage.name,
|
||
status=StageStatus.FAILED,
|
||
error=error_msg,
|
||
)
|
||
|
||
elif task_status.get("status") == TaskStatus.CANCELLED:
|
||
return StageResult(
|
||
stage_name=stage.name,
|
||
status=StageStatus.FAILED,
|
||
error="Task was cancelled",
|
||
)
|
||
|
||
# 超时
|
||
raise asyncio.TimeoutError()
|
||
|
||
def _dry_run_stage(
|
||
self,
|
||
stage: PipelineStage,
|
||
resolved_inputs: dict[str, Any],
|
||
) -> StageResult:
|
||
"""
|
||
Dry-run模式执行:模拟Agent返回结果。仅用于测试/开发环境。
|
||
|
||
在没有dispatcher的环境下使用,用于测试和调试Pipeline定义。
|
||
如果在生产环境中触发,则记录 ERROR 级别告警。
|
||
|
||
Args:
|
||
stage: 阶段定义
|
||
resolved_inputs: 已解析的输入参数
|
||
|
||
Returns:
|
||
模拟的StageResult
|
||
"""
|
||
import os
|
||
if os.environ.get("ENV", "development") == "production":
|
||
logger.error(
|
||
f"Pipeline 进入 dry-run 模式(stage={stage.name})!"
|
||
"生产环境中 TaskDispatcher 未正确初始化,请检查系统配置。"
|
||
)
|
||
else:
|
||
logger.warning(f"[DRY-RUN] stage={stage.name} 返回模拟输出")
|
||
|
||
# 为声明的输出变量生成模拟值
|
||
mock_outputs: dict[str, Any] = {}
|
||
for output_name in stage.outputs:
|
||
mock_outputs[output_name] = f"[dry-run] {output_name} from {stage.agent}.{stage.action}"
|
||
|
||
logger.info(
|
||
f"Stage '{stage.name}' dry-run completed "
|
||
f"(agent={stage.agent}, action={stage.action}, "
|
||
f"inputs={list(resolved_inputs.keys())}, "
|
||
f"outputs={stage.outputs})"
|
||
)
|
||
|
||
return StageResult(
|
||
stage_name=stage.name,
|
||
status=StageStatus.COMPLETED,
|
||
outputs=mock_outputs,
|
||
)
|
||
|
||
def _topological_sort(self, stages: list[PipelineStage]) -> list[PipelineStage]:
|
||
"""
|
||
拓扑排序(Kahn算法)。
|
||
|
||
根据depends_on依赖关系确定执行顺序。
|
||
无依赖的阶段先执行,有依赖的后执行。
|
||
支持并行执行无依赖关系的阶段(本实现为串行,但顺序正确)。
|
||
|
||
Args:
|
||
stages: PipelineStage列表
|
||
|
||
Returns:
|
||
排序后的PipelineStage列表
|
||
|
||
Raises:
|
||
ValueError: 存在循环依赖
|
||
"""
|
||
name_to_stage = {s.name: s for s in stages}
|
||
stage_names = set(name_to_stage.keys())
|
||
|
||
# 构建邻接表和入度表
|
||
in_degree: dict[str, int] = {name: 0 for name in stage_names}
|
||
adj: dict[str, list[str]] = {name: [] for name in stage_names}
|
||
|
||
for stage in stages:
|
||
for dep in stage.depends_on:
|
||
if dep in stage_names:
|
||
adj[dep].append(stage.name)
|
||
in_degree[stage.name] += 1
|
||
|
||
# 入度为0的节点入队
|
||
queue: list[str] = sorted(
|
||
[name for name, deg in in_degree.items() if deg == 0]
|
||
)
|
||
sorted_names: list[str] = []
|
||
|
||
while queue:
|
||
# 按名称排序保证确定性
|
||
node = queue.pop(0)
|
||
sorted_names.append(node)
|
||
|
||
for neighbor in sorted(adj[node]):
|
||
in_degree[neighbor] -= 1
|
||
if in_degree[neighbor] == 0:
|
||
queue.append(neighbor)
|
||
queue.sort() # 保证确定性顺序
|
||
|
||
if len(sorted_names) != len(stage_names):
|
||
raise ValueError(
|
||
f"Cyclic dependency detected in pipeline stages: "
|
||
f"only {len(sorted_names)}/{len(stage_names)} stages sorted"
|
||
)
|
||
|
||
return [name_to_stage[name] for name in sorted_names]
|
||
|
||
def _resolve_stage_inputs(
|
||
self,
|
||
inputs: dict[str, Any],
|
||
context: dict[str, Any],
|
||
) -> dict[str, Any]:
|
||
"""
|
||
解析stage的输入变量。
|
||
|
||
将inputs中的${...}变量引用替换为context中的实际值。
|
||
|
||
Args:
|
||
inputs: 原始输入参数(可能包含变量引用)
|
||
context: 完整上下文(全局变量 + stages上下文)
|
||
|
||
Returns:
|
||
解析后的输入参数
|
||
"""
|
||
return PipelineLoader.resolve_variables(inputs, context)
|
||
|
||
def _should_skip(
|
||
self,
|
||
stage: PipelineStage,
|
||
failed_stages: set[str],
|
||
skipped_stages: set[str],
|
||
) -> bool:
|
||
"""
|
||
判断是否应该跳过此阶段。
|
||
|
||
如果某个依赖的阶段失败了且不是continue_on_failure,
|
||
则此阶段应跳过。
|
||
|
||
Args:
|
||
stage: 阶段定义
|
||
failed_stages: 已失败的阶段集合
|
||
skipped_stages: 已跳过的阶段集合
|
||
|
||
Returns:
|
||
True如果应该跳过
|
||
"""
|
||
for dep in stage.depends_on:
|
||
if dep in failed_stages or dep in skipped_stages:
|
||
return True
|
||
return False
|
||
|
||
def _evaluate_condition(
|
||
self,
|
||
condition: str,
|
||
exec_context: dict[str, Any],
|
||
stages_context: dict[str, Any],
|
||
) -> bool:
|
||
"""
|
||
评估条件表达式。
|
||
|
||
当前支持简单的变量存在性检查和等值比较:
|
||
- "${var}" → 变量存在且非空
|
||
- "${var} == 'value'" → 变量等于指定值
|
||
- "${var} != 'value'" → 变量不等于指定值
|
||
|
||
Args:
|
||
condition: 条件表达式字符串
|
||
exec_context: 全局执行上下文
|
||
stages_context: 阶段结果上下文
|
||
|
||
Returns:
|
||
True如果条件满足
|
||
"""
|
||
resolve_ctx = dict(exec_context)
|
||
resolve_ctx["stages"] = stages_context
|
||
|
||
# 先解析变量
|
||
resolved = PipelineLoader.resolve_variables(condition, resolve_ctx)
|
||
|
||
if isinstance(resolved, bool):
|
||
return resolved
|
||
|
||
resolved_str = str(resolved).strip()
|
||
|
||
# 等值比较
|
||
if "==" in resolved_str:
|
||
left, right = resolved_str.split("==", 1)
|
||
return left.strip().strip("'\"") == right.strip().strip("'\"")
|
||
|
||
# 不等比较
|
||
if "!=" in resolved_str:
|
||
left, right = resolved_str.split("!=", 1)
|
||
return left.strip().strip("'\"") != right.strip().strip("'\"")
|
||
|
||
# 非空判断
|
||
return bool(resolved_str) and resolved_str.lower() not in ("false", "0", "none", "")
|
||
|
||
def _extract_outputs(
|
||
self,
|
||
stage: PipelineStage,
|
||
output_data: dict[str, Any] | None,
|
||
) -> dict[str, Any]:
|
||
"""
|
||
从Agent的output_data中提取stage声明的输出变量。
|
||
|
||
Args:
|
||
stage: 阶段定义
|
||
output_data: Agent返回的完整输出数据
|
||
|
||
Returns:
|
||
仅包含声明输出的字典
|
||
"""
|
||
if not output_data:
|
||
return {}
|
||
|
||
if not stage.outputs:
|
||
# 如果未声明outputs,返回全部output_data
|
||
return output_data
|
||
|
||
return {
|
||
key: output_data.get(key)
|
||
for key in stage.outputs
|
||
if key in output_data
|
||
}
|