refactor(core,experts): classify except Exception + structured ReviewResult (U3)
A `ReviewResult` dataclass (passed/degraded/feedback) replaces the tuple + `[DEGRADED]` prefix in `_review_phase_output`; the 3 `review_result` WS payloads now carry a `degraded` field (AE3). `except Exception` is narrowed to specific exception types across 10 files (core/react, rewoo, base, orchestrator, dispatcher, plan_exec_engine + experts/orchestrator, _phase_executor, _review_gate + orchestrator/pipeline_engine). Baseline: 140 -> 66 occurrences (>=50% reduction). Fix RuntimeError regression: the review-gate and compression paths now also catch `RuntimeError` (LLM/provider internal errors) to preserve degradation semantics. Test `side_effect` switched to functional form to avoid `StopIteration` on list exhaustion. ruff clean; 135 key + 469 experts + 163 core tests pass.
This commit is contained in:
parent
47ee2449df
commit
be5c4e09f8
|
|
@ -246,7 +246,7 @@ class BaseAgent(ABC):
|
||||||
self._redis = aioredis.from_url(redis_url, decode_responses=True)
|
self._redis = aioredis.from_url(redis_url, decode_responses=True)
|
||||||
await self._redis.ping()
|
await self._redis.ping()
|
||||||
logger.info(f"Agent '{self.name}' connected to Redis")
|
logger.info(f"Agent '{self.name}' connected to Redis")
|
||||||
except Exception as e:
|
except (ConnectionError, OSError, asyncio.TimeoutError, ValueError) as e:
|
||||||
self._redis = None
|
self._redis = None
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Agent '{self.name}' Redis unavailable: {e}, falling back to local mode"
|
f"Agent '{self.name}' Redis unavailable: {e}, falling back to local mode"
|
||||||
|
|
@ -380,7 +380,10 @@ class BaseAgent(ABC):
|
||||||
# 失败钩子
|
# 失败钩子
|
||||||
try:
|
try:
|
||||||
await self.on_task_failed(task, TaskCancelledError(task.task_id))
|
await self.on_task_failed(task, TaskCancelledError(task.task_id))
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
except Exception as hook_err:
|
except Exception as hook_err:
|
||||||
|
# 用户提供的 hook — 任意异常都可能,不阻塞 TaskResult 构建
|
||||||
logger.error(f"on_task_failed hook error: {hook_err}")
|
logger.error(f"on_task_failed hook error: {hook_err}")
|
||||||
|
|
||||||
elapsed = time.monotonic() - start_time
|
elapsed = time.monotonic() - start_time
|
||||||
|
|
@ -408,7 +411,10 @@ class BaseAgent(ABC):
|
||||||
await self.on_task_failed(
|
await self.on_task_failed(
|
||||||
task, TaskTimeoutError(task.task_id, task.timeout_seconds)
|
task, TaskTimeoutError(task.task_id, task.timeout_seconds)
|
||||||
)
|
)
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
except Exception as hook_err:
|
except Exception as hook_err:
|
||||||
|
# 用户提供的 hook — 任意异常都可能,不阻塞 TaskResult 构建
|
||||||
logger.error(f"on_task_failed hook error: {hook_err}")
|
logger.error(f"on_task_failed hook error: {hook_err}")
|
||||||
|
|
||||||
elapsed = time.monotonic() - start_time
|
elapsed = time.monotonic() - start_time
|
||||||
|
|
@ -427,12 +433,20 @@ class BaseAgent(ABC):
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
# CancelledError 必须传播,不被 except Exception 吞掉
|
||||||
|
raise
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
# 框架边界 catch-all:handle_task 是用户实现,可能抛任意异常;
|
||||||
|
# execute() 契约要求始终返回 TaskResult,故保留兜底。
|
||||||
logger.error(f"Agent '{self.name}' task {task.task_id} failed: {e}")
|
logger.error(f"Agent '{self.name}' task {task.task_id} failed: {e}")
|
||||||
|
|
||||||
# 失败钩子
|
# 失败钩子
|
||||||
try:
|
try:
|
||||||
await self.on_task_failed(task, e)
|
await self.on_task_failed(task, e)
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
except Exception as hook_err:
|
except Exception as hook_err:
|
||||||
logger.error(f"on_task_failed hook error: {hook_err}")
|
logger.error(f"on_task_failed hook error: {hook_err}")
|
||||||
|
|
||||||
|
|
@ -517,13 +531,13 @@ class BaseAgent(ABC):
|
||||||
f"agent:{self.name}:progress",
|
f"agent:{self.name}:progress",
|
||||||
json.dumps(progress_obj.to_dict()),
|
json.dumps(progress_obj.to_dict()),
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (ConnectionError, asyncio.TimeoutError, OSError) as e:
|
||||||
logger.warning(f"Failed to publish progress for task {task_id}: {e}")
|
logger.warning(f"Failed to publish progress for task {task_id}: {e}")
|
||||||
|
|
||||||
if self._dispatcher is not None:
|
if self._dispatcher is not None:
|
||||||
try:
|
try:
|
||||||
await self._dispatcher.handle_progress(progress_obj)
|
await self._dispatcher.handle_progress(progress_obj)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, RuntimeError) as e:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Failed to report progress to dispatcher for task {task_id}: {e}"
|
f"Failed to report progress to dispatcher for task {task_id}: {e}"
|
||||||
)
|
)
|
||||||
|
|
@ -544,7 +558,7 @@ class BaseAgent(ABC):
|
||||||
await asyncio.sleep(30)
|
await asyncio.sleep(30)
|
||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
pass
|
pass
|
||||||
except Exception as e:
|
except (ConnectionError, asyncio.TimeoutError, OSError, RuntimeError) as e:
|
||||||
logger.error(f"Heartbeat error for agent '{self.name}': {e}")
|
logger.error(f"Heartbeat error for agent '{self.name}': {e}")
|
||||||
|
|
||||||
async def _listen_for_tasks(self):
|
async def _listen_for_tasks(self):
|
||||||
|
|
@ -565,11 +579,11 @@ class BaseAgent(ABC):
|
||||||
task_data = json.loads(task_json)
|
task_data = json.loads(task_json)
|
||||||
task = TaskMessage.from_dict(task_data)
|
task = TaskMessage.from_dict(task_data)
|
||||||
asyncio.create_task(self._execute_task_with_semaphore(task))
|
asyncio.create_task(self._execute_task_with_semaphore(task))
|
||||||
except Exception as e:
|
except (json.JSONDecodeError, KeyError, TypeError, ValueError) as e:
|
||||||
logger.error(f"Failed to parse task message: {e}")
|
logger.error(f"Failed to parse task message: {e}")
|
||||||
except asyncio.CancelledError:
|
except asyncio.CancelledError:
|
||||||
pass
|
pass
|
||||||
except Exception as e:
|
except (ConnectionError, asyncio.TimeoutError, OSError, RuntimeError) as e:
|
||||||
logger.error(f"Task listener error for agent '{self.name}': {e}")
|
logger.error(f"Task listener error for agent '{self.name}': {e}")
|
||||||
|
|
||||||
async def _execute_task_with_semaphore(self, task: TaskMessage):
|
async def _execute_task_with_semaphore(self, task: TaskMessage):
|
||||||
|
|
@ -593,7 +607,13 @@ class BaseAgent(ABC):
|
||||||
if self._redis is not None and self._dispatcher is not None:
|
if self._redis is not None and self._dispatcher is not None:
|
||||||
await self._dispatcher.handle_result(result)
|
await self._dispatcher.handle_result(result)
|
||||||
|
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
# CancelledError 必须传播,不被 except 吞掉
|
||||||
|
raise
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
# 兜底:execute() 内部已捕获大部分异常并返回 TaskResult,
|
||||||
|
# 此处仅捕获 dispatcher 失败或 execute() 边界外的异常
|
||||||
logger.error(f"Agent '{self.name}' task {task.task_id} failed: {e}")
|
logger.error(f"Agent '{self.name}' task {task.task_id} failed: {e}")
|
||||||
error_result = TaskResult(
|
error_result = TaskResult(
|
||||||
task_id=task.task_id,
|
task_id=task.task_id,
|
||||||
|
|
@ -622,5 +642,6 @@ class BaseAgent(ABC):
|
||||||
jsonschema.validate(data, schema)
|
jsonschema.validate(data, schema)
|
||||||
except ImportError:
|
except ImportError:
|
||||||
logger.warning("jsonschema not installed, skipping input validation")
|
logger.warning("jsonschema not installed, skipping input validation")
|
||||||
except Exception as e:
|
except (ValueError, TypeError, KeyError) as e:
|
||||||
|
# jsonschema.ValidationError 继承 ValueError;其余为 schema/data 类型错误
|
||||||
raise SchemaValidationError(self.name, str(e))
|
raise SchemaValidationError(self.name, str(e))
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@
|
||||||
与业务系统解耦:通过依赖注入获取 Redis 连接和数据库会话。
|
与业务系统解耦:通过依赖注入获取 Redis 连接和数据库会话。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import ipaddress
|
import ipaddress
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
|
@ -12,7 +13,6 @@ from typing import Any, Callable, Awaitable
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from agentkit.core.exceptions import (
|
from agentkit.core.exceptions import (
|
||||||
NoAvailableAgentError,
|
|
||||||
TaskDispatchError,
|
TaskDispatchError,
|
||||||
TaskNotFoundError,
|
TaskNotFoundError,
|
||||||
)
|
)
|
||||||
|
|
@ -51,7 +51,7 @@ def _validate_callback_url(url: str) -> bool:
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
except Exception:
|
except (ValueError, TypeError):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if parsed.scheme not in ("http", "https"):
|
if parsed.scheme not in ("http", "https"):
|
||||||
|
|
@ -159,7 +159,7 @@ class TaskDispatcher:
|
||||||
|
|
||||||
except TaskDispatchError:
|
except TaskDispatchError:
|
||||||
raise
|
raise
|
||||||
except Exception as e:
|
except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
|
||||||
await db.rollback()
|
await db.rollback()
|
||||||
logger.error(f"Failed to dispatch task {task.task_id}: {e}")
|
logger.error(f"Failed to dispatch task {task.task_id}: {e}")
|
||||||
raise TaskDispatchError(task.task_id, str(e))
|
raise TaskDispatchError(task.task_id, str(e))
|
||||||
|
|
@ -197,7 +197,7 @@ class TaskDispatcher:
|
||||||
|
|
||||||
except TaskNotFoundError:
|
except TaskNotFoundError:
|
||||||
raise
|
raise
|
||||||
except Exception as e:
|
except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
|
||||||
await db.rollback()
|
await db.rollback()
|
||||||
logger.error(f"Failed to cancel task {task_id}: {e}")
|
logger.error(f"Failed to cancel task {task_id}: {e}")
|
||||||
raise
|
raise
|
||||||
|
|
@ -263,7 +263,7 @@ class TaskDispatcher:
|
||||||
|
|
||||||
logger.info(f"Task {result.task_id} result handled (status={result.status})")
|
logger.info(f"Task {result.task_id} result handled (status={result.status})")
|
||||||
|
|
||||||
except Exception as e:
|
except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
|
||||||
await db.rollback()
|
await db.rollback()
|
||||||
logger.error(f"Failed to handle result for task {result.task_id}: {e}")
|
logger.error(f"Failed to handle result for task {result.task_id}: {e}")
|
||||||
|
|
||||||
|
|
@ -295,7 +295,7 @@ class TaskDispatcher:
|
||||||
)
|
)
|
||||||
await db.commit()
|
await db.commit()
|
||||||
|
|
||||||
except Exception as e:
|
except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
|
||||||
await db.rollback()
|
await db.rollback()
|
||||||
logger.error(f"Failed to handle progress for task {progress.task_id}: {e}")
|
logger.error(f"Failed to handle progress for task {progress.task_id}: {e}")
|
||||||
|
|
||||||
|
|
@ -359,7 +359,7 @@ class TaskDispatcher:
|
||||||
if retried > 0:
|
if retried > 0:
|
||||||
logger.info(f"Retried {retried} failed tasks")
|
logger.info(f"Retried {retried} failed tasks")
|
||||||
|
|
||||||
except Exception as e:
|
except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
|
||||||
await db.rollback()
|
await db.rollback()
|
||||||
logger.error(f"Failed to retry failed tasks: {e}")
|
logger.error(f"Failed to retry failed tasks: {e}")
|
||||||
|
|
||||||
|
|
@ -392,7 +392,7 @@ class TaskDispatcher:
|
||||||
async with httpx.AsyncClient(timeout=10) as client:
|
async with httpx.AsyncClient(timeout=10) as client:
|
||||||
await client.post(callback_url, json=result.to_dict())
|
await client.post(callback_url, json=result.to_dict())
|
||||||
logger.info(f"Callback triggered for task {result.task_id}")
|
logger.info(f"Callback triggered for task {result.task_id}")
|
||||||
except Exception as e:
|
except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError) as e:
|
||||||
logger.warning(f"Callback failed for task {result.task_id}: {e}")
|
logger.warning(f"Callback failed for task {result.task_id}: {e}")
|
||||||
|
|
||||||
def _task_to_dict(self, task: Any) -> dict:
|
def _task_to_dict(self, task: Any) -> dict:
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,8 @@ from dataclasses import dataclass, field
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import TYPE_CHECKING, Any
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
from agentkit.core.protocol import TaskMessage, TaskResult, TaskStatus
|
from agentkit.core.exceptions import LLMProviderError
|
||||||
|
from agentkit.core.protocol import TaskMessage, TaskStatus
|
||||||
from agentkit.core.shared_workspace import SharedWorkspace
|
from agentkit.core.shared_workspace import SharedWorkspace
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
|
@ -224,7 +225,7 @@ class Orchestrator:
|
||||||
subtasks=subtasks,
|
subtasks=subtasks,
|
||||||
parallel_groups=parallel_groups,
|
parallel_groups=parallel_groups,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (RuntimeError, ValueError, KeyError, AttributeError) as e:
|
||||||
logger.warning(f"GoalPlanner decomposition failed, falling back: {e}")
|
logger.warning(f"GoalPlanner decomposition failed, falling back: {e}")
|
||||||
|
|
||||||
# If LLM gateway available, use it for decomposition
|
# If LLM gateway available, use it for decomposition
|
||||||
|
|
@ -239,7 +240,7 @@ class Orchestrator:
|
||||||
subtasks=subtasks,
|
subtasks=subtasks,
|
||||||
parallel_groups=parallel_groups,
|
parallel_groups=parallel_groups,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, ValueError, TypeError, KeyError) as e:
|
||||||
logger.warning(f"LLM decomposition failed, falling back to simple: {e}")
|
logger.warning(f"LLM decomposition failed, falling back to simple: {e}")
|
||||||
|
|
||||||
# Fallback: single subtask = original task
|
# Fallback: single subtask = original task
|
||||||
|
|
@ -418,7 +419,7 @@ class Orchestrator:
|
||||||
"status": "completed",
|
"status": "completed",
|
||||||
},
|
},
|
||||||
))
|
))
|
||||||
except Exception as e:
|
except (ConnectionError, RuntimeError, OSError) as e:
|
||||||
logger.warning(f"Failed to publish progress via MessageBus: {e}")
|
logger.warning(f"Failed to publish progress via MessageBus: {e}")
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
@ -437,10 +438,12 @@ class Orchestrator:
|
||||||
"error": "Subtask timed out",
|
"error": "Subtask timed out",
|
||||||
},
|
},
|
||||||
))
|
))
|
||||||
except Exception as e:
|
except (ConnectionError, RuntimeError, OSError) as e:
|
||||||
logger.warning(f"Failed to publish progress via MessageBus: {e}")
|
logger.warning(f"Failed to publish progress via MessageBus: {e}")
|
||||||
return error_result
|
return error_result
|
||||||
except Exception as e:
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
|
except (RuntimeError, ValueError, KeyError, AttributeError, ConnectionError, LLMProviderError) as e:
|
||||||
error_result = {"status": "failed", "error": str(e)}
|
error_result = {"status": "failed", "error": str(e)}
|
||||||
if self._message_bus is not None:
|
if self._message_bus is not None:
|
||||||
try:
|
try:
|
||||||
|
|
@ -455,7 +458,7 @@ class Orchestrator:
|
||||||
"error": str(e),
|
"error": str(e),
|
||||||
},
|
},
|
||||||
))
|
))
|
||||||
except Exception as e:
|
except (ConnectionError, RuntimeError, OSError) as e:
|
||||||
logger.warning(f"Failed to publish progress via MessageBus: {e}")
|
logger.warning(f"Failed to publish progress via MessageBus: {e}")
|
||||||
return error_result
|
return error_result
|
||||||
|
|
||||||
|
|
@ -513,7 +516,7 @@ class Orchestrator:
|
||||||
try:
|
try:
|
||||||
agents_info = self._agent_pool.list_agents()
|
agents_info = self._agent_pool.list_agents()
|
||||||
return [a["name"] for a in agents_info]
|
return [a["name"] for a in agents_info]
|
||||||
except Exception:
|
except (RuntimeError, KeyError, AttributeError):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
def _convert_execution_plan_to_subtasks(
|
def _convert_execution_plan_to_subtasks(
|
||||||
|
|
@ -561,7 +564,7 @@ class Orchestrator:
|
||||||
description = agent.get("description", "").lower()
|
description = agent.get("description", "").lower()
|
||||||
if skill.lower() in name.lower() or skill.lower() in agent_type.lower() or skill.lower() in description:
|
if skill.lower() in name.lower() or skill.lower() in agent_type.lower() or skill.lower() in description:
|
||||||
return name
|
return name
|
||||||
except Exception:
|
except (RuntimeError, KeyError, AttributeError):
|
||||||
pass
|
pass
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -580,9 +583,6 @@ class Orchestrator:
|
||||||
Returns:
|
Returns:
|
||||||
OrchestrationResult: 编排结果,metadata 中包含迭代历史
|
OrchestrationResult: 编排结果,metadata 中包含迭代历史
|
||||||
"""
|
"""
|
||||||
import time as _time
|
|
||||||
|
|
||||||
start_time = _time.monotonic()
|
|
||||||
iteration_history: list[dict[str, Any]] = []
|
iteration_history: list[dict[str, Any]] = []
|
||||||
|
|
||||||
# First execution
|
# First execution
|
||||||
|
|
@ -650,7 +650,7 @@ class Orchestrator:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return await self._llm_evaluate(task, result)
|
return await self._llm_evaluate(task, result)
|
||||||
except Exception as e:
|
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, ValueError, RuntimeError) as e:
|
||||||
logger.warning(f"LLM evaluation failed, falling back to rule-based: {e}")
|
logger.warning(f"LLM evaluation failed, falling back to rule-based: {e}")
|
||||||
return self._rule_based_evaluate(result)
|
return self._rule_based_evaluate(result)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,7 @@ from dataclasses import dataclass, field
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from typing import TYPE_CHECKING, Any, Awaitable, Callable
|
from typing import TYPE_CHECKING, Any, Awaitable, Callable
|
||||||
|
|
||||||
from agentkit.core.exceptions import TaskCancelledError, TaskTimeoutError
|
from agentkit.core.exceptions import LLMProviderError, TaskCancelledError, TaskTimeoutError
|
||||||
from agentkit.core.goal_planner import GoalPlanner
|
from agentkit.core.goal_planner import GoalPlanner
|
||||||
from agentkit.core.plan_executor import PlanExecutor, PlanExecutionResult
|
from agentkit.core.plan_executor import PlanExecutor, PlanExecutionResult
|
||||||
from agentkit.core.plan_schema import ExecutionPlan, PlanStep, PlanStepStatus
|
from agentkit.core.plan_schema import ExecutionPlan, PlanStep, PlanStepStatus
|
||||||
|
|
@ -214,7 +214,7 @@ class PlanExecEngine:
|
||||||
system_prompt += f"\n\n## 参考信息\n{memory_context}"
|
system_prompt += f"\n\n## 参考信息\n{memory_context}"
|
||||||
else:
|
else:
|
||||||
system_prompt = f"## 参考信息\n{memory_context}"
|
system_prompt = f"## 参考信息\n{memory_context}"
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
|
||||||
logger.warning(f"Memory retrieval failed, continuing without context: {e}")
|
logger.warning(f"Memory retrieval failed, continuing without context: {e}")
|
||||||
|
|
||||||
# 启动轨迹记录
|
# 启动轨迹记录
|
||||||
|
|
@ -440,7 +440,7 @@ class PlanExecEngine:
|
||||||
value={"output_summary": summary, "agent_name": agent_name},
|
value={"output_summary": summary, "agent_name": agent_name},
|
||||||
metadata={"task_type": task_type, "outcome": trace_outcome},
|
metadata={"task_type": task_type, "outcome": trace_outcome},
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
|
||||||
logger.warning(f"Failed to store task result in episodic memory: {e}")
|
logger.warning(f"Failed to store task result in episodic memory: {e}")
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
@ -477,7 +477,7 @@ class PlanExecEngine:
|
||||||
system_prompt += f"\n\n## 参考信息\n{memory_context}"
|
system_prompt += f"\n\n## 参考信息\n{memory_context}"
|
||||||
else:
|
else:
|
||||||
system_prompt = f"## 参考信息\n{memory_context}"
|
system_prompt = f"## 参考信息\n{memory_context}"
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
|
||||||
logger.warning(f"Memory retrieval failed, continuing without context: {e}")
|
logger.warning(f"Memory retrieval failed, continuing without context: {e}")
|
||||||
|
|
||||||
# 启动轨迹记录
|
# 启动轨迹记录
|
||||||
|
|
@ -514,7 +514,7 @@ class PlanExecEngine:
|
||||||
"goal": plan.goal,
|
"goal": plan.goal,
|
||||||
"steps": [s.to_dict() for s in plan.steps],
|
"steps": [s.to_dict() for s in plan.steps],
|
||||||
})
|
})
|
||||||
except Exception as e:
|
except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
|
||||||
logger.warning(f"Step event callback failed: {e}")
|
logger.warning(f"Step event callback failed: {e}")
|
||||||
|
|
||||||
trajectory.append(ReActStep(
|
trajectory.append(ReActStep(
|
||||||
|
|
@ -535,7 +535,7 @@ class PlanExecEngine:
|
||||||
"goal": spec.goal,
|
"goal": spec.goal,
|
||||||
"num_steps": len(spec.steps),
|
"num_steps": len(spec.steps),
|
||||||
})
|
})
|
||||||
except Exception as e:
|
except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
|
||||||
logger.warning(f"Step event callback failed: {e}")
|
logger.warning(f"Step event callback failed: {e}")
|
||||||
|
|
||||||
if trace_recorder is not None:
|
if trace_recorder is not None:
|
||||||
|
|
@ -604,7 +604,7 @@ class PlanExecEngine:
|
||||||
value={"output_summary": summary, "agent_name": agent_name},
|
value={"output_summary": summary, "agent_name": agent_name},
|
||||||
metadata={"task_type": task_type, "outcome": trace_outcome},
|
metadata={"task_type": task_type, "outcome": trace_outcome},
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
|
||||||
logger.warning(f"Failed to store task result in episodic memory: {e}")
|
logger.warning(f"Failed to store task result in episodic memory: {e}")
|
||||||
|
|
||||||
async def _execute_with_replanning(
|
async def _execute_with_replanning(
|
||||||
|
|
@ -685,7 +685,7 @@ class PlanExecEngine:
|
||||||
"result": step_result.result,
|
"result": step_result.result,
|
||||||
"error": step_result.error,
|
"error": step_result.error,
|
||||||
})
|
})
|
||||||
except Exception as e:
|
except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
|
||||||
logger.warning(f"Step event callback failed: {e}")
|
logger.warning(f"Step event callback failed: {e}")
|
||||||
|
|
||||||
if trace_recorder is not None:
|
if trace_recorder is not None:
|
||||||
|
|
@ -733,7 +733,7 @@ class PlanExecEngine:
|
||||||
"root_cause": reflection_report.root_cause,
|
"root_cause": reflection_report.root_cause,
|
||||||
"new_plan_id": current_plan.plan_id,
|
"new_plan_id": current_plan.plan_id,
|
||||||
})
|
})
|
||||||
except Exception as e:
|
except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
|
||||||
logger.warning(f"Step event callback failed: {e}")
|
logger.warning(f"Step event callback failed: {e}")
|
||||||
|
|
||||||
trajectory.append(ReActStep(
|
trajectory.append(ReActStep(
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@ from dataclasses import dataclass, field
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from typing import TYPE_CHECKING, Any
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
from agentkit.core.exceptions import LoopDetectedError, TaskCancelledError, TaskTimeoutError
|
from agentkit.core.exceptions import LLMProviderError, LoopDetectedError, TaskCancelledError, TaskTimeoutError
|
||||||
from agentkit.core.protocol import CancellationToken
|
from agentkit.core.protocol import CancellationToken
|
||||||
from agentkit.llm.gateway import LLMGateway
|
from agentkit.llm.gateway import LLMGateway
|
||||||
from agentkit.llm.protocol import LLMResponse
|
from agentkit.llm.protocol import LLMResponse
|
||||||
|
|
@ -659,7 +659,8 @@ class ReActEngine:
|
||||||
)
|
)
|
||||||
or ""
|
or ""
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
|
||||||
|
# 检索层故障(RAG/Redis/LLM embedding)— 不阻塞主流程
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Memory retrieval failed, continuing without context: {e}", exc_info=True
|
f"Memory retrieval failed, continuing without context: {e}", exc_info=True
|
||||||
)
|
)
|
||||||
|
|
@ -679,7 +680,8 @@ class ReActEngine:
|
||||||
if compressor:
|
if compressor:
|
||||||
try:
|
try:
|
||||||
conversation = await compressor.compress(conversation)
|
conversation = await compressor.compress(conversation)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
|
||||||
|
# 压缩器通常调用 LLM — LLM 不可用类异常降级为原对话
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Context compression failed, continuing with original messages: {e}"
|
f"Context compression failed, continuing with original messages: {e}"
|
||||||
)
|
)
|
||||||
|
|
@ -1052,7 +1054,11 @@ class ReActEngine:
|
||||||
approved = await confirmation_handler(
|
approved = await confirmation_handler(
|
||||||
confirmation_id, command, reason
|
confirmation_id, command, reason
|
||||||
)
|
)
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
# 用户提供的 confirmation_handler — 任意异常都可能,
|
||||||
|
# 不阻塞主循环,降级为未批准
|
||||||
logger.warning(f"Confirmation handler error: {e}")
|
logger.warning(f"Confirmation handler error: {e}")
|
||||||
|
|
||||||
if approved:
|
if approved:
|
||||||
|
|
@ -1066,9 +1072,10 @@ class ReActEngine:
|
||||||
clean_args["_skip_dangerous_check"] = True
|
clean_args["_skip_dangerous_check"] = True
|
||||||
try:
|
try:
|
||||||
tool_result = await tool.safe_execute(**clean_args)
|
tool_result = await tool.safe_execute(**clean_args)
|
||||||
except Exception as e:
|
except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
|
||||||
tool_result = {
|
tool_result = {
|
||||||
"error": f"Tool '{tc.name}' execution failed: {e}"
|
"error": f"Tool '{tc.name}' execution failed: {e}",
|
||||||
|
"error_code": "tool_execution_failed",
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
clean_args = {
|
clean_args = {
|
||||||
|
|
@ -1083,9 +1090,10 @@ class ReActEngine:
|
||||||
if tool
|
if tool
|
||||||
else {"error": f"Tool '{tc.name}' not found"}
|
else {"error": f"Tool '{tc.name}' not found"}
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
|
||||||
tool_result = {
|
tool_result = {
|
||||||
"error": f"Tool '{tc.name}' execution failed: {e}"
|
"error": f"Tool '{tc.name}' execution failed: {e}",
|
||||||
|
"error_code": "tool_execution_failed",
|
||||||
}
|
}
|
||||||
|
|
||||||
yield ReActEvent(
|
yield ReActEvent(
|
||||||
|
|
@ -1146,7 +1154,7 @@ class ReActEngine:
|
||||||
if self._should_compress(conversation, compressor):
|
if self._should_compress(conversation, compressor):
|
||||||
try:
|
try:
|
||||||
conversation = await compressor.compress(conversation)
|
conversation = await compressor.compress(conversation)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
|
||||||
logger.warning(f"Incremental compression failed: {e}")
|
logger.warning(f"Incremental compression failed: {e}")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|
@ -1217,7 +1225,7 @@ class ReActEngine:
|
||||||
if self._should_compress(conversation, compressor):
|
if self._should_compress(conversation, compressor):
|
||||||
try:
|
try:
|
||||||
conversation = await compressor.compress(conversation)
|
conversation = await compressor.compress(conversation)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
|
||||||
logger.warning(f"Incremental compression failed: {e}")
|
logger.warning(f"Incremental compression failed: {e}")
|
||||||
else:
|
else:
|
||||||
# ponytail: 检查是否为畸形工具调用(含 <tool_use> 但解析失败)
|
# ponytail: 检查是否为畸形工具调用(含 <tool_use> 但解析失败)
|
||||||
|
|
@ -1332,7 +1340,7 @@ class ReActEngine:
|
||||||
reinjections,
|
reinjections,
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
|
||||||
logger.warning(f"Verification loop failed: {e}")
|
logger.warning(f"Verification loop failed: {e}")
|
||||||
|
|
||||||
# Yield final_answer event (legacy format for execute_stream consumers)
|
# Yield final_answer event (legacy format for execute_stream consumers)
|
||||||
|
|
@ -1428,7 +1436,8 @@ class ReActEngine:
|
||||||
value={"output_summary": summary, "agent_name": agent_name},
|
value={"output_summary": summary, "agent_name": agent_name},
|
||||||
metadata={"task_type": task_type, "outcome": trace_outcome},
|
metadata={"task_type": task_type, "outcome": trace_outcome},
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
|
||||||
|
# EpisodicMemory 持久化故障(PG/Redis)— 不影响主结果
|
||||||
logger.warning(f"Failed to store task result in episodic memory: {e}")
|
logger.warning(f"Failed to store task result in episodic memory: {e}")
|
||||||
|
|
||||||
async def execute_stream(
|
async def execute_stream(
|
||||||
|
|
@ -1555,7 +1564,7 @@ class ReActEngine:
|
||||||
"""通过 gateway 查询 model 对应的 provider 名。失败回退 None(字符串拼接)。"""
|
"""通过 gateway 查询 model 对应的 provider 名。失败回退 None(字符串拼接)。"""
|
||||||
try:
|
try:
|
||||||
return self._llm_gateway.get_provider_name_for_model(model)
|
return self._llm_gateway.get_provider_name_for_model(model)
|
||||||
except Exception:
|
except (AttributeError, KeyError, LLMProviderError):
|
||||||
# ponytail: 测试中 gateway 可能是 MagicMock,无该方法;回退保守路径
|
# ponytail: 测试中 gateway 可能是 MagicMock,无该方法;回退保守路径
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -1723,7 +1732,7 @@ class ReActEngine:
|
||||||
if compressor and tool_name:
|
if compressor and tool_name:
|
||||||
try:
|
try:
|
||||||
content = await compressor.compress_tool_result(tool_name, result)
|
content = await compressor.compress_tool_result(tool_name, result)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
|
||||||
logger.warning(f"Tool result compression failed for '{tool_name}': {e}")
|
logger.warning(f"Tool result compression failed for '{tool_name}': {e}")
|
||||||
content = str(result)
|
content = str(result)
|
||||||
return {
|
return {
|
||||||
|
|
@ -1771,10 +1780,11 @@ class ReActEngine:
|
||||||
"error_code": e.error_code,
|
"error_code": e.error_code,
|
||||||
"details": e.details,
|
"details": e.details,
|
||||||
}
|
}
|
||||||
except Exception as e:
|
except (ValueError, TypeError, RuntimeError, asyncio.TimeoutError) as e:
|
||||||
|
# 工具执行失败 — 记录结构化错误码,LLM 可在下一步调整策略
|
||||||
error_msg = f"Tool '{tool_name}' execution failed: {e}"
|
error_msg = f"Tool '{tool_name}' execution failed: {e}"
|
||||||
logger.warning(error_msg)
|
logger.warning(error_msg)
|
||||||
return {"error": error_msg}
|
return {"error": error_msg, "error_code": "tool_execution_failed"}
|
||||||
|
|
||||||
async def _execute_tool_with_confirmation(
|
async def _execute_tool_with_confirmation(
|
||||||
self,
|
self,
|
||||||
|
|
@ -1818,7 +1828,10 @@ class ReActEngine:
|
||||||
if confirmation_handler is not None:
|
if confirmation_handler is not None:
|
||||||
try:
|
try:
|
||||||
approved = await confirmation_handler(confirmation_id, command, reason)
|
approved = await confirmation_handler(confirmation_id, command, reason)
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
# 用户提供的 confirmation_handler — 任意异常都可能,不阻塞主循环
|
||||||
logger.warning(f"Confirmation handler error: {e}")
|
logger.warning(f"Confirmation handler error: {e}")
|
||||||
|
|
||||||
if approved:
|
if approved:
|
||||||
|
|
@ -1829,8 +1842,11 @@ class ReActEngine:
|
||||||
clean_args["_skip_dangerous_check"] = True
|
clean_args["_skip_dangerous_check"] = True
|
||||||
try:
|
try:
|
||||||
tool_result = await tool.safe_execute(**clean_args)
|
tool_result = await tool.safe_execute(**clean_args)
|
||||||
except Exception as e:
|
except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
|
||||||
tool_result = {"error": f"Tool '{tc.name}' execution failed: {e}"}
|
tool_result = {
|
||||||
|
"error": f"Tool '{tc.name}' execution failed: {e}",
|
||||||
|
"error_code": "tool_execution_failed",
|
||||||
|
}
|
||||||
else:
|
else:
|
||||||
# Non-dangerous tool: re-execute with skip flag
|
# Non-dangerous tool: re-execute with skip flag
|
||||||
clean_args = {k: v for k, v in tc.arguments.items() if not k.startswith("_")}
|
clean_args = {k: v for k, v in tc.arguments.items() if not k.startswith("_")}
|
||||||
|
|
@ -1841,7 +1857,7 @@ class ReActEngine:
|
||||||
if tool
|
if tool
|
||||||
else {"error": f"Tool '{tc.name}' not found"}
|
else {"error": f"Tool '{tc.name}' not found"}
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
|
||||||
tool_result = {"error": f"Tool '{tc.name}' execution failed: {e}"}
|
tool_result = {"error": f"Tool '{tc.name}' execution failed: {e}"}
|
||||||
|
|
||||||
events.append(
|
events.append(
|
||||||
|
|
|
||||||
|
|
@ -11,23 +11,21 @@ import logging
|
||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from datetime import datetime, timezone
|
|
||||||
from typing import TYPE_CHECKING, Any
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
from agentkit.core.exceptions import TaskCancelledError, TaskTimeoutError
|
from agentkit.core.exceptions import LLMProviderError, TaskCancelledError, TaskTimeoutError
|
||||||
from agentkit.core.protocol import CancellationToken
|
from agentkit.core.protocol import CancellationToken
|
||||||
from agentkit.core.react import ReActEngine, ReActEvent, ReActResult, ReActStep
|
from agentkit.core.react import ReActEngine, ReActEvent, ReActResult, ReActStep
|
||||||
from agentkit.llm.gateway import LLMGateway
|
from agentkit.llm.gateway import LLMGateway
|
||||||
from agentkit.llm.protocol import LLMResponse
|
from agentkit.tools.base import Tool, ToolValidationError
|
||||||
from agentkit.tools.base import Tool
|
from agentkit.telemetry.tracing import start_span, _OTEL_AVAILABLE
|
||||||
from agentkit.telemetry.tracing import get_tracer, start_span, _OTEL_AVAILABLE
|
|
||||||
from agentkit.telemetry.metrics import (
|
from agentkit.telemetry.metrics import (
|
||||||
agent_request_counter,
|
agent_request_counter,
|
||||||
agent_duration_histogram,
|
agent_duration_histogram,
|
||||||
)
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from agentkit.core.compressor import CompressionStrategy, ContextCompressor
|
from agentkit.core.compressor import CompressionStrategy
|
||||||
from agentkit.core.trace import TraceRecorder
|
from agentkit.core.trace import TraceRecorder
|
||||||
from agentkit.memory.retriever import MemoryRetriever
|
from agentkit.memory.retriever import MemoryRetriever
|
||||||
|
|
||||||
|
|
@ -296,7 +294,7 @@ class ReWOOEngine:
|
||||||
effective_system_prompt += f"\n\n## 参考信息\n{memory_context}"
|
effective_system_prompt += f"\n\n## 参考信息\n{memory_context}"
|
||||||
else:
|
else:
|
||||||
effective_system_prompt = f"## 参考信息\n{memory_context}"
|
effective_system_prompt = f"## 参考信息\n{memory_context}"
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
|
||||||
logger.warning(f"Memory retrieval failed, continuing without context: {e}")
|
logger.warning(f"Memory retrieval failed, continuing without context: {e}")
|
||||||
|
|
||||||
# ── Phase 1: Planning ──
|
# ── Phase 1: Planning ──
|
||||||
|
|
@ -360,7 +358,7 @@ class ReWOOEngine:
|
||||||
if compressor:
|
if compressor:
|
||||||
try:
|
try:
|
||||||
llm_messages = await compressor.compress(llm_messages)
|
llm_messages = await compressor.compress(llm_messages)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
|
||||||
logger.warning(f"Context compression failed: {e}")
|
logger.warning(f"Context compression failed: {e}")
|
||||||
|
|
||||||
response = await self._llm_gateway.chat(
|
response = await self._llm_gateway.chat(
|
||||||
|
|
@ -492,7 +490,7 @@ class ReWOOEngine:
|
||||||
value={"output_summary": summary, "agent_name": agent_name},
|
value={"output_summary": summary, "agent_name": agent_name},
|
||||||
metadata={"task_type": task_type, "outcome": trace_outcome},
|
metadata={"task_type": task_type, "outcome": trace_outcome},
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
|
||||||
logger.warning(f"Failed to store task result in episodic memory: {e}")
|
logger.warning(f"Failed to store task result in episodic memory: {e}")
|
||||||
|
|
||||||
return ReActResult(
|
return ReActResult(
|
||||||
|
|
@ -569,7 +567,7 @@ class ReWOOEngine:
|
||||||
effective_system_prompt += f"\n\n## 参考信息\n{memory_context}"
|
effective_system_prompt += f"\n\n## 参考信息\n{memory_context}"
|
||||||
else:
|
else:
|
||||||
effective_system_prompt = f"## 参考信息\n{memory_context}"
|
effective_system_prompt = f"## 参考信息\n{memory_context}"
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
|
||||||
logger.warning(f"Memory retrieval failed, continuing without context: {e}")
|
logger.warning(f"Memory retrieval failed, continuing without context: {e}")
|
||||||
|
|
||||||
trajectory: list[ReActStep] = []
|
trajectory: list[ReActStep] = []
|
||||||
|
|
@ -647,7 +645,7 @@ class ReWOOEngine:
|
||||||
if compressor:
|
if compressor:
|
||||||
try:
|
try:
|
||||||
llm_messages = await compressor.compress(llm_messages)
|
llm_messages = await compressor.compress(llm_messages)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
|
||||||
logger.warning(f"Context compression failed: {e}")
|
logger.warning(f"Context compression failed: {e}")
|
||||||
|
|
||||||
response = await self._llm_gateway.chat(
|
response = await self._llm_gateway.chat(
|
||||||
|
|
@ -769,6 +767,9 @@ class ReWOOEngine:
|
||||||
"total_tokens": total_tokens,
|
"total_tokens": total_tokens,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
trace_outcome = "cancelled"
|
||||||
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
trace_outcome = "error"
|
trace_outcome = "error"
|
||||||
logger.error(f"ReWOO execute_stream failed: {e}")
|
logger.error(f"ReWOO execute_stream failed: {e}")
|
||||||
|
|
@ -786,7 +787,7 @@ class ReWOOEngine:
|
||||||
value={"output_summary": summary, "agent_name": agent_name},
|
value={"output_summary": summary, "agent_name": agent_name},
|
||||||
metadata={"task_type": task_type, "outcome": trace_outcome},
|
metadata={"task_type": task_type, "outcome": trace_outcome},
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
|
||||||
logger.warning(f"Failed to store task result in episodic memory: {e}")
|
logger.warning(f"Failed to store task result in episodic memory: {e}")
|
||||||
|
|
||||||
# ── Fallback Strategy Helpers ──────────────────────────
|
# ── Fallback Strategy Helpers ──────────────────────────
|
||||||
|
|
@ -914,7 +915,7 @@ class ReWOOEngine:
|
||||||
output, synthesis_tokens = await self._synthesis_phase(messages=messages, tool_results=tool_results, model=model, agent_name=agent_name, task_type=task_type, system_prompt=effective_system_prompt, compressor=compressor, cancellation_token=cancellation_token)
|
output, synthesis_tokens = await self._synthesis_phase(messages=messages, tool_results=tool_results, model=model, agent_name=agent_name, task_type=task_type, system_prompt=effective_system_prompt, compressor=compressor, cancellation_token=cancellation_token)
|
||||||
yield ReActEvent(event_type="final_answer", step=len(plan.steps) + 1, data={"output": output, "total_steps": len(plan.steps) + 1, "total_tokens": simplified_tokens + synthesis_tokens})
|
yield ReActEvent(event_type="final_answer", step=len(plan.steps) + 1, data={"output": output, "total_steps": len(plan.steps) + 1, "total_tokens": simplified_tokens + synthesis_tokens})
|
||||||
return
|
return
|
||||||
except Exception as e:
|
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
|
||||||
logger.warning(f"Simplified ReWOO planning also failed in stream mode: {e}")
|
logger.warning(f"Simplified ReWOO planning also failed in stream mode: {e}")
|
||||||
# Failed, continue to next strategy by not returning
|
# Failed, continue to next strategy by not returning
|
||||||
# This signals the caller to try the next strategy
|
# This signals the caller to try the next strategy
|
||||||
|
|
@ -951,7 +952,7 @@ class ReWOOEngine:
|
||||||
):
|
):
|
||||||
yield event
|
yield event
|
||||||
return
|
return
|
||||||
except Exception as e:
|
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ToolValidationError) as e:
|
||||||
logger.warning(f"ReAct fallback also failed in stream mode: {e}")
|
logger.warning(f"ReAct fallback also failed in stream mode: {e}")
|
||||||
raise _FallbackFailedError("react")
|
raise _FallbackFailedError("react")
|
||||||
|
|
||||||
|
|
@ -975,13 +976,13 @@ class ReWOOEngine:
|
||||||
if compressor:
|
if compressor:
|
||||||
try:
|
try:
|
||||||
direct_messages = await compressor.compress(direct_messages)
|
direct_messages = await compressor.compress(direct_messages)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
|
||||||
logger.warning(f"Context compression failed in direct fallback: {e}")
|
logger.warning(f"Context compression failed in direct fallback: {e}")
|
||||||
direct_response = await self._llm_gateway.chat(messages=direct_messages, model=model, agent_name=agent_name, task_type=task_type)
|
direct_response = await self._llm_gateway.chat(messages=direct_messages, model=model, agent_name=agent_name, task_type=task_type)
|
||||||
output = direct_response.content or ""
|
output = direct_response.content or ""
|
||||||
yield ReActEvent(event_type="final_answer", step=1, data={"output": output, "total_steps": 1, "total_tokens": total_tokens + direct_response.usage.total_tokens})
|
yield ReActEvent(event_type="final_answer", step=1, data={"output": output, "total_steps": 1, "total_tokens": total_tokens + direct_response.usage.total_tokens})
|
||||||
return
|
return
|
||||||
except Exception as e:
|
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
|
||||||
logger.error(f"Direct LLM fallback also failed in stream mode: {e}")
|
logger.error(f"Direct LLM fallback also failed in stream mode: {e}")
|
||||||
raise _FallbackFailedError("direct")
|
raise _FallbackFailedError("direct")
|
||||||
|
|
||||||
|
|
@ -1024,7 +1025,7 @@ class ReWOOEngine:
|
||||||
output, synthesis_tokens = await self._synthesis_phase(messages=messages, tool_results=tool_results, model=model, agent_name=agent_name, task_type=task_type, system_prompt=effective_system_prompt, compressor=compressor, cancellation_token=cancellation_token)
|
output, synthesis_tokens = await self._synthesis_phase(messages=messages, tool_results=tool_results, model=model, agent_name=agent_name, task_type=task_type, system_prompt=effective_system_prompt, compressor=compressor, cancellation_token=cancellation_token)
|
||||||
yield ReActEvent(event_type="final_answer", step=len(plan.steps) + 1, data={"output": output, "total_steps": len(plan.steps) + 1, "total_tokens": plan_tokens + synthesis_tokens})
|
yield ReActEvent(event_type="final_answer", step=len(plan.steps) + 1, data={"output": output, "total_steps": len(plan.steps) + 1, "total_tokens": plan_tokens + synthesis_tokens})
|
||||||
return
|
return
|
||||||
except Exception as e:
|
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
|
||||||
logger.warning(f"Plan-exec fallback also failed in stream mode: {e}")
|
logger.warning(f"Plan-exec fallback also failed in stream mode: {e}")
|
||||||
raise _FallbackFailedError("plan_exec")
|
raise _FallbackFailedError("plan_exec")
|
||||||
|
|
||||||
|
|
@ -1178,7 +1179,7 @@ class ReWOOEngine:
|
||||||
total_tokens=total_tokens,
|
total_tokens=total_tokens,
|
||||||
fallback_strategy="simplified_rewoo",
|
fallback_strategy="simplified_rewoo",
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
|
||||||
logger.warning(f"Simplified ReWOO planning also failed: {e}")
|
logger.warning(f"Simplified ReWOO planning also failed: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -1219,7 +1220,7 @@ class ReWOOEngine:
|
||||||
)
|
)
|
||||||
react_result.fallback_strategy = "react"
|
react_result.fallback_strategy = "react"
|
||||||
return react_result
|
return react_result
|
||||||
except Exception as e:
|
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ToolValidationError) as e:
|
||||||
logger.warning(f"ReAct fallback also failed: {e}")
|
logger.warning(f"ReAct fallback also failed: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -1247,7 +1248,7 @@ class ReWOOEngine:
|
||||||
if compressor:
|
if compressor:
|
||||||
try:
|
try:
|
||||||
direct_messages = await compressor.compress(direct_messages)
|
direct_messages = await compressor.compress(direct_messages)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
|
||||||
logger.warning(f"Context compression failed in direct fallback: {e}")
|
logger.warning(f"Context compression failed in direct fallback: {e}")
|
||||||
|
|
||||||
direct_response = await self._llm_gateway.chat(
|
direct_response = await self._llm_gateway.chat(
|
||||||
|
|
@ -1284,7 +1285,7 @@ class ReWOOEngine:
|
||||||
total_tokens=total_tokens,
|
total_tokens=total_tokens,
|
||||||
fallback_strategy="direct",
|
fallback_strategy="direct",
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
|
||||||
logger.error(f"Direct LLM fallback also failed: {e}")
|
logger.error(f"Direct LLM fallback also failed: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -1361,7 +1362,7 @@ class ReWOOEngine:
|
||||||
total_tokens=total_tokens,
|
total_tokens=total_tokens,
|
||||||
fallback_strategy="plan_exec",
|
fallback_strategy="plan_exec",
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
|
||||||
logger.warning(f"Plan-exec fallback also failed: {e}")
|
logger.warning(f"Plan-exec fallback also failed: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
@ -1418,7 +1419,7 @@ class ReWOOEngine:
|
||||||
if compressor:
|
if compressor:
|
||||||
try:
|
try:
|
||||||
planning_messages = await compressor.compress(planning_messages)
|
planning_messages = await compressor.compress(planning_messages)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
|
||||||
logger.warning(f"Context compression failed during planning: {e}")
|
logger.warning(f"Context compression failed during planning: {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -1429,7 +1430,7 @@ class ReWOOEngine:
|
||||||
task_type=task_type,
|
task_type=task_type,
|
||||||
tools=tool_schemas,
|
tools=tool_schemas,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (LLMProviderError, asyncio.TimeoutError, ConnectionError) as e:
|
||||||
logger.warning(f"LLM call failed during planning: {e}")
|
logger.warning(f"LLM call failed during planning: {e}")
|
||||||
return None, 0
|
return None, 0
|
||||||
|
|
||||||
|
|
@ -1496,7 +1497,7 @@ class ReWOOEngine:
|
||||||
if compressor:
|
if compressor:
|
||||||
try:
|
try:
|
||||||
synthesis_messages = await compressor.compress(synthesis_messages)
|
synthesis_messages = await compressor.compress(synthesis_messages)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
|
||||||
logger.warning(f"Context compression failed during synthesis: {e}")
|
logger.warning(f"Context compression failed during synthesis: {e}")
|
||||||
|
|
||||||
response = await self._llm_gateway.chat(
|
response = await self._llm_gateway.chat(
|
||||||
|
|
@ -1611,7 +1612,7 @@ class ReWOOEngine:
|
||||||
try:
|
try:
|
||||||
result = await tool.safe_execute(**arguments)
|
result = await tool.safe_execute(**arguments)
|
||||||
return result
|
return result
|
||||||
except Exception as e:
|
except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
|
||||||
error_msg = f"Tool '{tool_name}' execution failed: {e}"
|
error_msg = f"Tool '{tool_name}' execution failed: {e}"
|
||||||
logger.warning(error_msg)
|
logger.warning(error_msg)
|
||||||
return {"error": error_msg}
|
return {"error": error_msg}
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import copy
|
import copy
|
||||||
import logging
|
import logging
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
|
|
@ -17,8 +18,6 @@ from .expert import Expert
|
||||||
from .plan import PhaseStatus, PhaseType, PlanPhase, TeamPlan
|
from .plan import PhaseStatus, PhaseType, PlanPhase, TeamPlan
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
import asyncio
|
|
||||||
|
|
||||||
from .team import ExpertTeam
|
from .team import ExpertTeam
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -61,7 +60,7 @@ class PhaseExecutorMixin:
|
||||||
full_data = await self._team.workspace.read(ref_key)
|
full_data = await self._team.workspace.read(ref_key)
|
||||||
if full_data:
|
if full_data:
|
||||||
return full_data.get("value", content)
|
return full_data.get("value", content)
|
||||||
except Exception as e:
|
except (asyncio.TimeoutError, ConnectionError, KeyError, AttributeError) as e:
|
||||||
logger.warning(f"Failed to read offloaded output '{ref_key}': {e}")
|
logger.warning(f"Failed to read offloaded output '{ref_key}': {e}")
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
@ -80,11 +79,11 @@ class PhaseExecutorMixin:
|
||||||
try:
|
try:
|
||||||
# U3: 返工循环 — 最多 MAX_REWORKS + 1 次(1 次初始 + MAX_REWORKS 次返工)
|
# U3: 返工循环 — 最多 MAX_REWORKS + 1 次(1 次初始 + MAX_REWORKS 次返工)
|
||||||
for _rework_attempt in range(self.MAX_REWORKS + 1):
|
for _rework_attempt in range(self.MAX_REWORKS + 1):
|
||||||
result, last_error, passed, feedback = await self._run_agent_steps(
|
result, last_error, passed, feedback, degraded = await self._run_agent_steps(
|
||||||
expert, agent, lead, phase, plan
|
expert, agent, lead, phase, plan
|
||||||
)
|
)
|
||||||
done = await self._finalize_phase(
|
done = await self._finalize_phase(
|
||||||
expert, lead, phase, plan, result, passed, feedback
|
expert, lead, phase, plan, result, passed, feedback, degraded
|
||||||
)
|
)
|
||||||
if done:
|
if done:
|
||||||
return result
|
return result
|
||||||
|
|
@ -181,9 +180,10 @@ class PhaseExecutorMixin:
|
||||||
lead: Expert,
|
lead: Expert,
|
||||||
phase: PlanPhase,
|
phase: PlanPhase,
|
||||||
plan: TeamPlan,
|
plan: TeamPlan,
|
||||||
) -> tuple[dict[str, Any], str | None, bool, str]:
|
) -> tuple[dict[str, Any], str | None, bool, str, bool]:
|
||||||
"""Run one rework iteration: read deps, build input, execute, review. Returns
|
"""Run one rework iteration: read deps, build input, execute, review. Returns
|
||||||
(result, last_error, passed, feedback). Raises RuntimeError on retry exhaustion."""
|
(result, last_error, passed, feedback, degraded). Raises RuntimeError on retry
|
||||||
|
exhaustion."""
|
||||||
# 每次迭代重新读取依赖输出(前置阶段可能在返工期间完成)
|
# 每次迭代重新读取依赖输出(前置阶段可能在返工期间完成)
|
||||||
dependency_outputs: dict[str, Any] = {}
|
dependency_outputs: dict[str, Any] = {}
|
||||||
for dep_id in phase.depends_on:
|
for dep_id in phase.depends_on:
|
||||||
|
|
@ -228,7 +228,12 @@ class PhaseExecutorMixin:
|
||||||
raise RuntimeError(f"Agent execution failed: {last_error}")
|
raise RuntimeError(f"Agent execution failed: {last_error}")
|
||||||
result = task_result.output_data or {"content": ""}
|
result = task_result.output_data or {"content": ""}
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except asyncio.CancelledError:
|
||||||
|
# CancelledError 必须传播,不被重试逻辑吞掉
|
||||||
|
raise
|
||||||
|
except (RuntimeError, asyncio.TimeoutError, ConnectionError) as e:
|
||||||
|
# agent.execute() 内部已捕获所有异常并返回 TaskResult,
|
||||||
|
# 此处仅捕获显式抛出的 RuntimeError + 罕见的基础设施异常
|
||||||
last_error = str(e)
|
last_error = str(e)
|
||||||
if attempt < self.MAX_RETRIES:
|
if attempt < self.MAX_RETRIES:
|
||||||
logger.info(f"Retrying phase {phase.id} (attempt {attempt + 1})")
|
logger.info(f"Retrying phase {phase.id} (attempt {attempt + 1})")
|
||||||
|
|
@ -250,9 +255,9 @@ class PhaseExecutorMixin:
|
||||||
"risk_description": risk_desc, "phase_id": phase.id, "phase_name": phase.name,
|
"risk_description": risk_desc, "phase_id": phase.id, "phase_name": phase.name,
|
||||||
})
|
})
|
||||||
|
|
||||||
# U3: Lead 验收阶段输出
|
# U3: Lead 验收阶段输出 — ReviewResult 结构化结果(含 degraded 标记)
|
||||||
passed, feedback = await self._review_phase_output(lead, phase, result)
|
review = await self._review_phase_output(lead, phase, result)
|
||||||
return result, last_error, passed, feedback
|
return result, last_error, review.passed, review.feedback, review.degraded
|
||||||
|
|
||||||
async def _finalize_phase(
|
async def _finalize_phase(
|
||||||
self,
|
self,
|
||||||
|
|
@ -263,9 +268,15 @@ class PhaseExecutorMixin:
|
||||||
result: dict[str, Any],
|
result: dict[str, Any],
|
||||||
passed: bool,
|
passed: bool,
|
||||||
feedback: str,
|
feedback: str,
|
||||||
|
degraded: bool = False,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
"""Handle review outcome: write workspace + emit completed, or rework/fail. Returns
|
"""Handle review outcome: write workspace + emit completed, or rework/fail. Returns
|
||||||
True if done (COMPLETED), False if rework continues. Raises on rework limit."""
|
True if done (COMPLETED), False if rework continues. Raises on rework limit.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
degraded: True 表示验收走了降级路径(LLM 不可用/超时/异常时自动通过),
|
||||||
|
广播到 ``review_result`` 事件 payload 让前端/运维可编程判断。
|
||||||
|
"""
|
||||||
if passed:
|
if passed:
|
||||||
phase.status = PhaseStatus.COMPLETED
|
phase.status = PhaseStatus.COMPLETED
|
||||||
# P2: SharedWorkspace 写入移到验收通过后 — 避免持久化被拒输出
|
# P2: SharedWorkspace 写入移到验收通过后 — 避免持久化被拒输出
|
||||||
|
|
@ -276,6 +287,7 @@ class PhaseExecutorMixin:
|
||||||
await self._broadcast_event("review_result", {
|
await self._broadcast_event("review_result", {
|
||||||
"phase_id": phase.id, "phase_name": phase.name, "passed": True,
|
"phase_id": phase.id, "phase_name": phase.name, "passed": True,
|
||||||
"feedback": feedback, "expert": phase.assigned_expert,
|
"feedback": feedback, "expert": phase.assigned_expert,
|
||||||
|
"degraded": degraded,
|
||||||
})
|
})
|
||||||
if phase.collaboration_contracts:
|
if phase.collaboration_contracts:
|
||||||
await self._notify_collaborators(phase, plan)
|
await self._notify_collaborators(phase, plan)
|
||||||
|
|
@ -288,7 +300,7 @@ class PhaseExecutorMixin:
|
||||||
})
|
})
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# 验收不合格 — 返工或标记失败
|
# 验收不合格 — 返工或标记失败(degraded 路径不应走到这里,但保持字段一致)
|
||||||
phase.rework_count += 1
|
phase.rework_count += 1
|
||||||
phase.review_feedback = feedback
|
phase.review_feedback = feedback
|
||||||
|
|
||||||
|
|
@ -304,6 +316,7 @@ class PhaseExecutorMixin:
|
||||||
"expert": phase.assigned_expert,
|
"expert": phase.assigned_expert,
|
||||||
"rework_count": phase.rework_count,
|
"rework_count": phase.rework_count,
|
||||||
"final_status": "failed",
|
"final_status": "failed",
|
||||||
|
"degraded": degraded,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
await self._broadcast_event(
|
await self._broadcast_event(
|
||||||
|
|
@ -329,6 +342,7 @@ class PhaseExecutorMixin:
|
||||||
"expert": phase.assigned_expert,
|
"expert": phase.assigned_expert,
|
||||||
"rework_count": phase.rework_count,
|
"rework_count": phase.rework_count,
|
||||||
"final_status": "rework",
|
"final_status": "rework",
|
||||||
|
"degraded": degraded,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
feedback_truncated = feedback[:500] if feedback else ""
|
feedback_truncated = feedback[:500] if feedback else ""
|
||||||
|
|
@ -377,7 +391,8 @@ class PhaseExecutorMixin:
|
||||||
agent = await pool.create_agent(temp_config)
|
agent = await pool.create_agent(temp_config)
|
||||||
self._temp_agents[phase.id] = temp_config.name
|
self._temp_agents[phase.id] = temp_config.name
|
||||||
return agent
|
return agent
|
||||||
except Exception as e:
|
except (ValueError, KeyError, RuntimeError, TypeError) as e:
|
||||||
|
# pool.create_agent 失败:config 校验/工具注册/依赖缺失等
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Failed to create isolated agent for phase {phase.id}, "
|
f"Failed to create isolated agent for phase {phase.id}, "
|
||||||
f"using expert's existing agent: {e}"
|
f"using expert's existing agent: {e}"
|
||||||
|
|
@ -393,5 +408,7 @@ class PhaseExecutorMixin:
|
||||||
if temp_name:
|
if temp_name:
|
||||||
try:
|
try:
|
||||||
await pool.remove_agent(temp_name)
|
await pool.remove_agent(temp_name)
|
||||||
except Exception as e:
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
|
except (KeyError, RuntimeError) as e:
|
||||||
logger.warning(f"Failed to clean up isolated agent '{temp_name}': {e}")
|
logger.warning(f"Failed to clean up isolated agent '{temp_name}': {e}")
|
||||||
|
|
|
||||||
|
|
@ -5,11 +5,15 @@
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from dataclasses import dataclass
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from agentkit.core.exceptions import LLMProviderError
|
||||||
|
|
||||||
from .expert import Expert
|
from .expert import Expert
|
||||||
from .plan import PlanPhase
|
from .plan import PlanPhase
|
||||||
|
|
||||||
|
|
@ -19,27 +23,46 @@ logger = logging.getLogger(__name__)
|
||||||
_RISK_FLAG_RE = re.compile(r"\[RISK:\s*(.+?)\]", re.DOTALL)
|
_RISK_FLAG_RE = re.compile(r"\[RISK:\s*(.+?)\]", re.DOTALL)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ReviewResult:
|
||||||
|
"""Lead 验收阶段输出的结构化结果(U3)。
|
||||||
|
|
||||||
|
替换原先的 ``tuple[bool, str]`` 返回值,让降级状态可被调用方/前端
|
||||||
|
可编程判断,而非依赖 ``[DEGRADED]`` 字符串前缀匹配。
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
passed: 验收是否通过(True=通过,False=需返工)
|
||||||
|
degraded: 是否处于降级路径(LLM 不可用/超时/异常时自动通过)
|
||||||
|
feedback: 验收反馈;降级时为降级原因,正常通过时为空,需返工时为修改要求
|
||||||
|
"""
|
||||||
|
|
||||||
|
passed: bool
|
||||||
|
degraded: bool = False
|
||||||
|
feedback: str = ""
|
||||||
|
|
||||||
|
|
||||||
class ReviewGateMixin:
|
class ReviewGateMixin:
|
||||||
"""Mixin: Lead 验收阶段输出质量 + 解析风险标记。由 TeamOrchestrator 组合。"""
|
"""Mixin: Lead 验收阶段输出质量 + 解析风险标记。由 TeamOrchestrator 组合。"""
|
||||||
|
|
||||||
async def _review_phase_output(
|
async def _review_phase_output(
|
||||||
self, lead: Expert, phase: PlanPhase, result: dict[str, Any]
|
self, lead: Expert, phase: PlanPhase, result: dict[str, Any]
|
||||||
) -> tuple[bool, str]:
|
) -> ReviewResult:
|
||||||
"""Lead 验收阶段输出质量。
|
"""Lead 验收阶段输出质量。
|
||||||
|
|
||||||
用 LLM 判断输出是否满足阶段要求。
|
用 LLM 判断输出是否满足阶段要求。返回 :class:`ReviewResult`:
|
||||||
返回 (passed, feedback):
|
- ``passed=True, degraded=False`` — 验收通过
|
||||||
- passed=True, feedback="" — 验收通过
|
- ``passed=False, feedback="修改要求"`` — 验收不合格,需返工
|
||||||
- passed=False, feedback="修改要求" — 验收不合格,需返工
|
- ``passed=True, degraded=True`` — LLM 不可用/超时/异常,优雅降级自动通过
|
||||||
|
|
||||||
若 LLM 不可用,跳过验收直接通过(优雅降级,feedback 标注降级原因)。
|
降级路径以 ``degraded=True`` 显式标记,让 ``review_result`` WS 事件
|
||||||
|
和日志聚合可编程判断降级频率,无需匹配 ``[DEGRADED]`` 字符串前缀。
|
||||||
"""
|
"""
|
||||||
gateway = self._get_llm_gateway(lead)
|
gateway = self._get_llm_gateway(lead)
|
||||||
if not gateway:
|
if not gateway:
|
||||||
logger.warning("No LLM gateway available, skipping review")
|
logger.warning("No LLM gateway available, skipping review")
|
||||||
# 优雅降级:不阻塞流程,但 [DEGRADED] 前缀让 review_result 事件
|
return ReviewResult(
|
||||||
# 和日志聚合可识别降级路径,便于运维监控验收失效频率。
|
passed=True, degraded=True, feedback="LLM 验收不可用,自动通过"
|
||||||
return True, "[DEGRADED] LLM 验收不可用,自动通过"
|
)
|
||||||
|
|
||||||
content = result.get("content", str(result))
|
content = result.get("content", str(result))
|
||||||
# P1: prompt injection 防护 — 用 XML 标签包裹专家输出,指示 LLM 忽略其中指令
|
# P1: prompt injection 防护 — 用 XML 标签包裹专家输出,指示 LLM 忽略其中指令
|
||||||
|
|
@ -60,32 +83,42 @@ class ReviewGateMixin:
|
||||||
messages=[{"role": "user", "content": prompt}],
|
messages=[{"role": "user", "content": prompt}],
|
||||||
model=self._get_model(lead),
|
model=self._get_model(lead),
|
||||||
)
|
)
|
||||||
# P2: 优先尝试直接解析整个响应为 JSON,避免贪婪正则匹配过多
|
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError) as e:
|
||||||
review: dict[str, Any] | None = None
|
# LLM 不可用类异常 — 优雅降级,不阻塞流程。
|
||||||
try:
|
# ponytail: RuntimeError 纳入捕获 — LiteLLM/provider 内部错误常以 RuntimeError
|
||||||
review = json.loads(response.content)
|
# 抛出(如 "LLM unavailable"),验收路径语义是"LLM 调用失败即降级",需覆盖。
|
||||||
except (json.JSONDecodeError, TypeError):
|
logger.warning(f"Review LLM call failed, degrading: {e}")
|
||||||
pass
|
return ReviewResult(
|
||||||
if review is None:
|
passed=True, degraded=True, feedback=f"LLM 验收降级,自动通过: {e}"
|
||||||
# 回退到正则提取第一个 JSON 对象
|
)
|
||||||
json_match = re.search(r"\{[^{}]*\}", response.content, re.DOTALL)
|
|
||||||
if json_match:
|
|
||||||
try:
|
|
||||||
review = json.loads(json_match.group(0))
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
pass
|
|
||||||
if review is not None:
|
|
||||||
# ponytail: 显式比较避免 bool("false") == True 陷阱
|
|
||||||
passed_raw = review.get("passed", True)
|
|
||||||
passed = passed_raw is True or str(passed_raw).lower() == "true"
|
|
||||||
feedback = review.get("feedback", "")
|
|
||||||
return passed, str(feedback)
|
|
||||||
logger.warning(f"Review LLM returned unparseable response: {response.content[:200]}")
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f"Review LLM call failed: {e}")
|
|
||||||
|
|
||||||
# 降级:不阻塞流程,但 [DEGRADED] 前缀让 review_result 事件可识别降级路径
|
# P2: 优先尝试直接解析整个响应为 JSON,避免贪婪正则匹配过多
|
||||||
return True, "[DEGRADED] LLM 验收降级,自动通过"
|
review: dict[str, Any] | None = None
|
||||||
|
try:
|
||||||
|
review = json.loads(response.content)
|
||||||
|
except (json.JSONDecodeError, TypeError):
|
||||||
|
pass
|
||||||
|
if review is None:
|
||||||
|
# 回退到正则提取第一个 JSON 对象
|
||||||
|
json_match = re.search(r"\{[^{}]*\}", response.content, re.DOTALL)
|
||||||
|
if json_match:
|
||||||
|
try:
|
||||||
|
review = json.loads(json_match.group(0))
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
if review is not None:
|
||||||
|
# ponytail: 显式比较避免 bool("false") == True 陷阱
|
||||||
|
passed_raw = review.get("passed", True)
|
||||||
|
passed = passed_raw is True or str(passed_raw).lower() == "true"
|
||||||
|
feedback = review.get("feedback", "")
|
||||||
|
return ReviewResult(passed=passed, feedback=str(feedback))
|
||||||
|
|
||||||
|
# 现有行为:LLM 返回不可解析响应时也走降级通过(plan 文档 line 274 标注
|
||||||
|
# passed=False,但实际生产行为是降级通过避免阻塞流水线 — 以现有行为为准)。
|
||||||
|
logger.warning(f"Review LLM returned unparseable response: {response.content[:200]}")
|
||||||
|
return ReviewResult(
|
||||||
|
passed=True, degraded=True, feedback="LLM 验收响应不可解析,自动通过"
|
||||||
|
)
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _parse_risk_flags(content: str) -> list[str]:
|
def _parse_risk_flags(content: str) -> list[str]:
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,7 @@ import logging
|
||||||
import re
|
import re
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from agentkit.core.exceptions import LLMProviderError
|
||||||
from agentkit.llm.gateway import LLMGateway
|
from agentkit.llm.gateway import LLMGateway
|
||||||
|
|
||||||
from ._debate_runner import DebateRunnerMixin
|
from ._debate_runner import DebateRunnerMixin
|
||||||
|
|
@ -169,7 +170,7 @@ class TeamOrchestrator(
|
||||||
if self._checkpoint is not None:
|
if self._checkpoint is not None:
|
||||||
try:
|
try:
|
||||||
await self._checkpoint.save_plan(plan)
|
await self._checkpoint.save_plan(plan)
|
||||||
except Exception as e:
|
except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError, ValueError, KeyError) as e:
|
||||||
logger.warning(f"Checkpoint save_plan failed: {e}")
|
logger.warning(f"Checkpoint save_plan failed: {e}")
|
||||||
|
|
||||||
# 4. Set EXECUTING status, execute phases
|
# 4. Set EXECUTING status, execute phases
|
||||||
|
|
@ -266,7 +267,7 @@ class TeamOrchestrator(
|
||||||
if should_save_checkpoint and self._checkpoint is not None:
|
if should_save_checkpoint and self._checkpoint is not None:
|
||||||
try:
|
try:
|
||||||
await self._checkpoint.save(plan.id, ph, plan.status.value)
|
await self._checkpoint.save(plan.id, ph, plan.status.value)
|
||||||
except Exception as e:
|
except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError, ValueError, KeyError) as e:
|
||||||
logger.warning(f"Checkpoint save failed for phase {ph.id}: {e}")
|
logger.warning(f"Checkpoint save failed for phase {ph.id}: {e}")
|
||||||
|
|
||||||
# U3: Divergence detection — check completed phases for conflicts
|
# U3: Divergence detection — check completed phases for conflicts
|
||||||
|
|
@ -310,7 +311,7 @@ class TeamOrchestrator(
|
||||||
if self._checkpoint is not None:
|
if self._checkpoint is not None:
|
||||||
try:
|
try:
|
||||||
await self._checkpoint.clear(plan.id)
|
await self._checkpoint.clear(plan.id)
|
||||||
except Exception as e:
|
except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError, ValueError, KeyError) as e:
|
||||||
logger.warning(f"Checkpoint clear failed: {e}")
|
logger.warning(f"Checkpoint clear failed: {e}")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -326,7 +327,9 @@ class TeamOrchestrator(
|
||||||
plan.status = PlanStatus.FAILED
|
plan.status = PlanStatus.FAILED
|
||||||
await self._broadcast_event("team_dissolved", {"team_id": self._team.team_id})
|
await self._broadcast_event("team_dissolved", {"team_id": self._team.team_id})
|
||||||
return await self._fallback_to_single_agent(task, plan, phase_results)
|
return await self._fallback_to_single_agent(task, plan, phase_results)
|
||||||
except Exception as e:
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
|
except (RuntimeError, ValueError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError, LLMProviderError) as e:
|
||||||
logger.error(f"Pipeline execution failed: {e}")
|
logger.error(f"Pipeline execution failed: {e}")
|
||||||
plan.status = PlanStatus.FAILED
|
plan.status = PlanStatus.FAILED
|
||||||
await self._broadcast_event("team_dissolved", {"team_id": self._team.team_id})
|
await self._broadcast_event("team_dissolved", {"team_id": self._team.team_id})
|
||||||
|
|
@ -463,7 +466,7 @@ class TeamOrchestrator(
|
||||||
if phases:
|
if phases:
|
||||||
return phases
|
return phases
|
||||||
logger.warning("LLM decomposition returned no valid phases")
|
logger.warning("LLM decomposition returned no valid phases")
|
||||||
except Exception as e:
|
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, json.JSONDecodeError, ValueError, TypeError) as e:
|
||||||
logger.warning(f"LLM task decomposition failed: {e}")
|
logger.warning(f"LLM task decomposition failed: {e}")
|
||||||
|
|
||||||
return [PlanPhase(name="执行", assigned_expert=lead.config.name, task_description=task)]
|
return [PlanPhase(name="执行", assigned_expert=lead.config.name, task_description=task)]
|
||||||
|
|
@ -588,5 +591,5 @@ class TeamOrchestrator(
|
||||||
await self._team.handoff_transport.send(
|
await self._team.handoff_transport.send(
|
||||||
self._team.team_channel, {"type": event_type, **data}
|
self._team.team_channel, {"type": event_type, **data}
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except (ConnectionError, RuntimeError, OSError, asyncio.TimeoutError) as e:
|
||||||
logger.warning(f"Failed to broadcast event '{event_type}': {e}")
|
logger.warning(f"Failed to broadcast event '{event_type}': {e}")
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,7 @@ from agentkit.orchestrator.pipeline_schema import (
|
||||||
StageStatus,
|
StageStatus,
|
||||||
)
|
)
|
||||||
from agentkit.orchestrator.reflection import PipelineReflector, PipelineReplanner
|
from agentkit.orchestrator.reflection import PipelineReflector, PipelineReplanner
|
||||||
from agentkit.orchestrator.retry import StepRetryPolicy, execute_with_retry
|
from agentkit.orchestrator.retry import execute_with_retry
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -143,7 +143,7 @@ class PipelineEngine:
|
||||||
steps=step_names,
|
steps=step_names,
|
||||||
input_data=context,
|
input_data=context,
|
||||||
)
|
)
|
||||||
except Exception as exc:
|
except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
|
||||||
logger.warning(f"Failed to create execution state: {exc}")
|
logger.warning(f"Failed to create execution state: {exc}")
|
||||||
|
|
||||||
# Create Saga orchestrator for compensation tracking
|
# Create Saga orchestrator for compensation tracking
|
||||||
|
|
@ -183,7 +183,7 @@ class PipelineEngine:
|
||||||
output=step_output,
|
output=step_output,
|
||||||
error=step_error,
|
error=step_error,
|
||||||
)
|
)
|
||||||
except Exception as exc:
|
except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
|
||||||
logger.warning(f"Failed to update step state: {exc}")
|
logger.warning(f"Failed to update step state: {exc}")
|
||||||
|
|
||||||
# 收集输出变量
|
# 收集输出变量
|
||||||
|
|
@ -219,7 +219,7 @@ class PipelineEngine:
|
||||||
step_name=stage.name,
|
step_name=stage.name,
|
||||||
error=result.error_message,
|
error=result.error_message,
|
||||||
)
|
)
|
||||||
except Exception as exc:
|
except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
|
||||||
logger.warning(f"Failed to persist failure state: {exc}")
|
logger.warning(f"Failed to persist failure state: {exc}")
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
@ -237,7 +237,7 @@ class PipelineEngine:
|
||||||
execution_id=execution_id,
|
execution_id=execution_id,
|
||||||
final_output=final_output,
|
final_output=final_output,
|
||||||
)
|
)
|
||||||
except Exception as exc:
|
except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
|
||||||
logger.warning(f"Failed to persist completion state: {exc}")
|
logger.warning(f"Failed to persist completion state: {exc}")
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
@ -346,7 +346,11 @@ class PipelineEngine:
|
||||||
|
|
||||||
return sr
|
return sr
|
||||||
|
|
||||||
except Exception as e:
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
|
|
||||||
|
except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
|
||||||
|
# dispatcher / agent 执行失败 — 转 StageResult.FAILED 不向上抛
|
||||||
return StageResult(
|
return StageResult(
|
||||||
stage_name=stage.name,
|
stage_name=stage.name,
|
||||||
status=StageStatus.FAILED,
|
status=StageStatus.FAILED,
|
||||||
|
|
@ -475,7 +479,9 @@ class PipelineEngine:
|
||||||
stage,
|
stage,
|
||||||
started_at,
|
started_at,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
|
except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
|
||||||
logger.error(f"Verifier execution failed for stage '{stage.name}': {e}")
|
logger.error(f"Verifier execution failed for stage '{stage.name}': {e}")
|
||||||
return StageResult(
|
return StageResult(
|
||||||
stage_name=stage.name,
|
stage_name=stage.name,
|
||||||
|
|
@ -619,7 +625,9 @@ class PipelineEngine:
|
||||||
step_name=stage.name,
|
step_name=stage.name,
|
||||||
)
|
)
|
||||||
return sr
|
return sr
|
||||||
except Exception as e:
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
|
except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
|
||||||
return StageResult(
|
return StageResult(
|
||||||
stage_name=stage.name,
|
stage_name=stage.name,
|
||||||
status=StageStatus.FAILED,
|
status=StageStatus.FAILED,
|
||||||
|
|
@ -679,7 +687,7 @@ class PipelineEngine:
|
||||||
score=output_data.get("score", 0.0),
|
score=output_data.get("score", 0.0),
|
||||||
)
|
)
|
||||||
return feedback
|
return feedback
|
||||||
except Exception as e:
|
except (TypeError, KeyError, ValueError) as e:
|
||||||
# 解析失败时直接抛出异常,避免死循环
|
# 解析失败时直接抛出异常,避免死循环
|
||||||
logger.error(f"Failed to parse verifier output: {e}")
|
logger.error(f"Failed to parse verifier output: {e}")
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
|
|
|
||||||
|
|
@ -790,10 +790,18 @@ class TestResultSynthesis:
|
||||||
{"name": "A", "assigned_expert": "member1", "task_description": "阶段A", "depends_on": []},
|
{"name": "A", "assigned_expert": "member1", "task_description": "阶段A", "depends_on": []},
|
||||||
{"name": "B", "assigned_expert": "member2", "task_description": "阶段B", "depends_on": []},
|
{"name": "B", "assigned_expert": "member2", "task_description": "阶段B", "depends_on": []},
|
||||||
])
|
])
|
||||||
# Synthesis call raises to force concatenation fallback
|
# ponytail: 函数式 side_effect — 首次返回 decomposition,后续一律抛 RuntimeError
|
||||||
gateway.chat = AsyncMock(
|
# (列表式 side_effect 耗尽会抛 StopIteration,被 U3 收窄后的 except 漏捕获;
|
||||||
side_effect=[decomp_response, RuntimeError("LLM unavailable")]
|
# 函数式让"LLM 不可用"语义明确,覆盖验收+综合所有后续调用)
|
||||||
)
|
call_count = [0]
|
||||||
|
|
||||||
|
async def chat_side_effect(messages, model=None, **kwargs):
|
||||||
|
call_count[0] += 1
|
||||||
|
if call_count[0] == 1:
|
||||||
|
return decomp_response
|
||||||
|
raise RuntimeError("LLM unavailable")
|
||||||
|
|
||||||
|
gateway.chat = AsyncMock(side_effect=chat_side_effect)
|
||||||
team._experts["lead"].agent._llm_gateway = gateway
|
team._experts["lead"].agent._llm_gateway = gateway
|
||||||
|
|
||||||
result = await orchestrator.execute("复杂任务")
|
result = await orchestrator.execute("复杂任务")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue