refactor: systematic tech debt cleanup (U1-U5) #8

Merged
fischer merged 7 commits from refactor/react-engine-unified-loop into main 2026-07-01 00:45:35 +08:00
11 changed files with 256 additions and 149 deletions
Showing only changes of commit be5c4e09f8 - Show all commits

View File

@ -246,7 +246,7 @@ class BaseAgent(ABC):
self._redis = aioredis.from_url(redis_url, decode_responses=True)
await self._redis.ping()
logger.info(f"Agent '{self.name}' connected to Redis")
except Exception as e:
except (ConnectionError, OSError, asyncio.TimeoutError, ValueError) as e:
self._redis = None
logger.warning(
f"Agent '{self.name}' Redis unavailable: {e}, falling back to local mode"
@ -380,7 +380,10 @@ class BaseAgent(ABC):
# 失败钩子
try:
await self.on_task_failed(task, TaskCancelledError(task.task_id))
except asyncio.CancelledError:
raise
except Exception as hook_err:
# 用户提供的 hook — 任意异常都可能,不阻塞 TaskResult 构建
logger.error(f"on_task_failed hook error: {hook_err}")
elapsed = time.monotonic() - start_time
@ -408,7 +411,10 @@ class BaseAgent(ABC):
await self.on_task_failed(
task, TaskTimeoutError(task.task_id, task.timeout_seconds)
)
except asyncio.CancelledError:
raise
except Exception as hook_err:
# 用户提供的 hook — 任意异常都可能,不阻塞 TaskResult 构建
logger.error(f"on_task_failed hook error: {hook_err}")
elapsed = time.monotonic() - start_time
@ -427,12 +433,20 @@ class BaseAgent(ABC):
},
)
except asyncio.CancelledError:
# CancelledError 必须传播,不被 except Exception 吞掉
raise
except Exception as e:
# 框架边界 catch-allhandle_task 是用户实现,可能抛任意异常;
# execute() 契约要求始终返回 TaskResult故保留兜底。
logger.error(f"Agent '{self.name}' task {task.task_id} failed: {e}")
# 失败钩子
try:
await self.on_task_failed(task, e)
except asyncio.CancelledError:
raise
except Exception as hook_err:
logger.error(f"on_task_failed hook error: {hook_err}")
@ -517,13 +531,13 @@ class BaseAgent(ABC):
f"agent:{self.name}:progress",
json.dumps(progress_obj.to_dict()),
)
except Exception as e:
except (ConnectionError, asyncio.TimeoutError, OSError) as e:
logger.warning(f"Failed to publish progress for task {task_id}: {e}")
if self._dispatcher is not None:
try:
await self._dispatcher.handle_progress(progress_obj)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, RuntimeError) as e:
logger.warning(
f"Failed to report progress to dispatcher for task {task_id}: {e}"
)
@ -544,7 +558,7 @@ class BaseAgent(ABC):
await asyncio.sleep(30)
except asyncio.CancelledError:
pass
except Exception as e:
except (ConnectionError, asyncio.TimeoutError, OSError, RuntimeError) as e:
logger.error(f"Heartbeat error for agent '{self.name}': {e}")
async def _listen_for_tasks(self):
@ -565,11 +579,11 @@ class BaseAgent(ABC):
task_data = json.loads(task_json)
task = TaskMessage.from_dict(task_data)
asyncio.create_task(self._execute_task_with_semaphore(task))
except Exception as e:
except (json.JSONDecodeError, KeyError, TypeError, ValueError) as e:
logger.error(f"Failed to parse task message: {e}")
except asyncio.CancelledError:
pass
except Exception as e:
except (ConnectionError, asyncio.TimeoutError, OSError, RuntimeError) as e:
logger.error(f"Task listener error for agent '{self.name}': {e}")
async def _execute_task_with_semaphore(self, task: TaskMessage):
@ -593,7 +607,13 @@ class BaseAgent(ABC):
if self._redis is not None and self._dispatcher is not None:
await self._dispatcher.handle_result(result)
except asyncio.CancelledError:
# CancelledError 必须传播,不被 except 吞掉
raise
except Exception as e:
# 兜底execute() 内部已捕获大部分异常并返回 TaskResult
# 此处仅捕获 dispatcher 失败或 execute() 边界外的异常
logger.error(f"Agent '{self.name}' task {task.task_id} failed: {e}")
error_result = TaskResult(
task_id=task.task_id,
@ -622,5 +642,6 @@ class BaseAgent(ABC):
jsonschema.validate(data, schema)
except ImportError:
logger.warning("jsonschema not installed, skipping input validation")
except Exception as e:
except (ValueError, TypeError, KeyError) as e:
# jsonschema.ValidationError 继承 ValueError其余为 schema/data 类型错误
raise SchemaValidationError(self.name, str(e))

View File

@ -3,6 +3,7 @@
与业务系统解耦通过依赖注入获取 Redis 连接和数据库会话
"""
import asyncio
import ipaddress
import json
import logging
@ -12,7 +13,6 @@ from typing import Any, Callable, Awaitable
from urllib.parse import urlparse
from agentkit.core.exceptions import (
NoAvailableAgentError,
TaskDispatchError,
TaskNotFoundError,
)
@ -51,7 +51,7 @@ def _validate_callback_url(url: str) -> bool:
"""
try:
parsed = urlparse(url)
except Exception:
except (ValueError, TypeError):
return False
if parsed.scheme not in ("http", "https"):
@ -159,7 +159,7 @@ class TaskDispatcher:
except TaskDispatchError:
raise
except Exception as e:
except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
await db.rollback()
logger.error(f"Failed to dispatch task {task.task_id}: {e}")
raise TaskDispatchError(task.task_id, str(e))
@ -197,7 +197,7 @@ class TaskDispatcher:
except TaskNotFoundError:
raise
except Exception as e:
except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
await db.rollback()
logger.error(f"Failed to cancel task {task_id}: {e}")
raise
@ -263,7 +263,7 @@ class TaskDispatcher:
logger.info(f"Task {result.task_id} result handled (status={result.status})")
except Exception as e:
except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
await db.rollback()
logger.error(f"Failed to handle result for task {result.task_id}: {e}")
@ -295,7 +295,7 @@ class TaskDispatcher:
)
await db.commit()
except Exception as e:
except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
await db.rollback()
logger.error(f"Failed to handle progress for task {progress.task_id}: {e}")
@ -359,7 +359,7 @@ class TaskDispatcher:
if retried > 0:
logger.info(f"Retried {retried} failed tasks")
except Exception as e:
except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
await db.rollback()
logger.error(f"Failed to retry failed tasks: {e}")
@ -392,7 +392,7 @@ class TaskDispatcher:
async with httpx.AsyncClient(timeout=10) as client:
await client.post(callback_url, json=result.to_dict())
logger.info(f"Callback triggered for task {result.task_id}")
except Exception as e:
except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError) as e:
logger.warning(f"Callback failed for task {result.task_id}: {e}")
def _task_to_dict(self, task: Any) -> dict:

View File

@ -12,7 +12,8 @@ from dataclasses import dataclass, field
from enum import Enum
from typing import TYPE_CHECKING, Any
from agentkit.core.protocol import TaskMessage, TaskResult, TaskStatus
from agentkit.core.exceptions import LLMProviderError
from agentkit.core.protocol import TaskMessage, TaskStatus
from agentkit.core.shared_workspace import SharedWorkspace
if TYPE_CHECKING:
@ -224,7 +225,7 @@ class Orchestrator:
subtasks=subtasks,
parallel_groups=parallel_groups,
)
except Exception as e:
except (RuntimeError, ValueError, KeyError, AttributeError) as e:
logger.warning(f"GoalPlanner decomposition failed, falling back: {e}")
# If LLM gateway available, use it for decomposition
@ -239,7 +240,7 @@ class Orchestrator:
subtasks=subtasks,
parallel_groups=parallel_groups,
)
except Exception as e:
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, ValueError, TypeError, KeyError) as e:
logger.warning(f"LLM decomposition failed, falling back to simple: {e}")
# Fallback: single subtask = original task
@ -418,7 +419,7 @@ class Orchestrator:
"status": "completed",
},
))
except Exception as e:
except (ConnectionError, RuntimeError, OSError) as e:
logger.warning(f"Failed to publish progress via MessageBus: {e}")
return output
@ -437,10 +438,12 @@ class Orchestrator:
"error": "Subtask timed out",
},
))
except Exception as e:
except (ConnectionError, RuntimeError, OSError) as e:
logger.warning(f"Failed to publish progress via MessageBus: {e}")
return error_result
except Exception as e:
except asyncio.CancelledError:
raise
except (RuntimeError, ValueError, KeyError, AttributeError, ConnectionError, LLMProviderError) as e:
error_result = {"status": "failed", "error": str(e)}
if self._message_bus is not None:
try:
@ -455,7 +458,7 @@ class Orchestrator:
"error": str(e),
},
))
except Exception as e:
except (ConnectionError, RuntimeError, OSError) as e:
logger.warning(f"Failed to publish progress via MessageBus: {e}")
return error_result
@ -513,7 +516,7 @@ class Orchestrator:
try:
agents_info = self._agent_pool.list_agents()
return [a["name"] for a in agents_info]
except Exception:
except (RuntimeError, KeyError, AttributeError):
return []
def _convert_execution_plan_to_subtasks(
@ -561,7 +564,7 @@ class Orchestrator:
description = agent.get("description", "").lower()
if skill.lower() in name.lower() or skill.lower() in agent_type.lower() or skill.lower() in description:
return name
except Exception:
except (RuntimeError, KeyError, AttributeError):
pass
return None
@ -580,9 +583,6 @@ class Orchestrator:
Returns:
OrchestrationResult: 编排结果metadata 中包含迭代历史
"""
import time as _time
start_time = _time.monotonic()
iteration_history: list[dict[str, Any]] = []
# First execution
@ -650,7 +650,7 @@ class Orchestrator:
try:
return await self._llm_evaluate(task, result)
except Exception as e:
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, ValueError, RuntimeError) as e:
logger.warning(f"LLM evaluation failed, falling back to rule-based: {e}")
return self._rule_based_evaluate(result)

View File

@ -18,7 +18,7 @@ from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any, Awaitable, Callable
from agentkit.core.exceptions import TaskCancelledError, TaskTimeoutError
from agentkit.core.exceptions import LLMProviderError, TaskCancelledError, TaskTimeoutError
from agentkit.core.goal_planner import GoalPlanner
from agentkit.core.plan_executor import PlanExecutor, PlanExecutionResult
from agentkit.core.plan_schema import ExecutionPlan, PlanStep, PlanStepStatus
@ -214,7 +214,7 @@ class PlanExecEngine:
system_prompt += f"\n\n## 参考信息\n{memory_context}"
else:
system_prompt = f"## 参考信息\n{memory_context}"
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
logger.warning(f"Memory retrieval failed, continuing without context: {e}")
# 启动轨迹记录
@ -440,7 +440,7 @@ class PlanExecEngine:
value={"output_summary": summary, "agent_name": agent_name},
metadata={"task_type": task_type, "outcome": trace_outcome},
)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
logger.warning(f"Failed to store task result in episodic memory: {e}")
# ------------------------------------------------------------------
@ -477,7 +477,7 @@ class PlanExecEngine:
system_prompt += f"\n\n## 参考信息\n{memory_context}"
else:
system_prompt = f"## 参考信息\n{memory_context}"
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
logger.warning(f"Memory retrieval failed, continuing without context: {e}")
# 启动轨迹记录
@ -514,7 +514,7 @@ class PlanExecEngine:
"goal": plan.goal,
"steps": [s.to_dict() for s in plan.steps],
})
except Exception as e:
except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
logger.warning(f"Step event callback failed: {e}")
trajectory.append(ReActStep(
@ -535,7 +535,7 @@ class PlanExecEngine:
"goal": spec.goal,
"num_steps": len(spec.steps),
})
except Exception as e:
except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
logger.warning(f"Step event callback failed: {e}")
if trace_recorder is not None:
@ -604,7 +604,7 @@ class PlanExecEngine:
value={"output_summary": summary, "agent_name": agent_name},
metadata={"task_type": task_type, "outcome": trace_outcome},
)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
logger.warning(f"Failed to store task result in episodic memory: {e}")
async def _execute_with_replanning(
@ -685,7 +685,7 @@ class PlanExecEngine:
"result": step_result.result,
"error": step_result.error,
})
except Exception as e:
except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
logger.warning(f"Step event callback failed: {e}")
if trace_recorder is not None:
@ -733,7 +733,7 @@ class PlanExecEngine:
"root_cause": reflection_report.root_cause,
"new_plan_id": current_plan.plan_id,
})
except Exception as e:
except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
logger.warning(f"Step event callback failed: {e}")
trajectory.append(ReActStep(

View File

@ -15,7 +15,7 @@ from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any
from agentkit.core.exceptions import LoopDetectedError, TaskCancelledError, TaskTimeoutError
from agentkit.core.exceptions import LLMProviderError, LoopDetectedError, TaskCancelledError, TaskTimeoutError
from agentkit.core.protocol import CancellationToken
from agentkit.llm.gateway import LLMGateway
from agentkit.llm.protocol import LLMResponse
@ -659,7 +659,8 @@ class ReActEngine:
)
or ""
)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
# 检索层故障RAG/Redis/LLM embedding— 不阻塞主流程
logger.warning(
f"Memory retrieval failed, continuing without context: {e}", exc_info=True
)
@ -679,7 +680,8 @@ class ReActEngine:
if compressor:
try:
conversation = await compressor.compress(conversation)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
# 压缩器通常调用 LLM — LLM 不可用类异常降级为原对话
logger.warning(
f"Context compression failed, continuing with original messages: {e}"
)
@ -1052,7 +1054,11 @@ class ReActEngine:
approved = await confirmation_handler(
confirmation_id, command, reason
)
except asyncio.CancelledError:
raise
except Exception as e:
# 用户提供的 confirmation_handler — 任意异常都可能,
# 不阻塞主循环,降级为未批准
logger.warning(f"Confirmation handler error: {e}")
if approved:
@ -1066,9 +1072,10 @@ class ReActEngine:
clean_args["_skip_dangerous_check"] = True
try:
tool_result = await tool.safe_execute(**clean_args)
except Exception as e:
except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
tool_result = {
"error": f"Tool '{tc.name}' execution failed: {e}"
"error": f"Tool '{tc.name}' execution failed: {e}",
"error_code": "tool_execution_failed",
}
else:
clean_args = {
@ -1083,9 +1090,10 @@ class ReActEngine:
if tool
else {"error": f"Tool '{tc.name}' not found"}
)
except Exception as e:
except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
tool_result = {
"error": f"Tool '{tc.name}' execution failed: {e}"
"error": f"Tool '{tc.name}' execution failed: {e}",
"error_code": "tool_execution_failed",
}
yield ReActEvent(
@ -1146,7 +1154,7 @@ class ReActEngine:
if self._should_compress(conversation, compressor):
try:
conversation = await compressor.compress(conversation)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
logger.warning(f"Incremental compression failed: {e}")
else:
@ -1217,7 +1225,7 @@ class ReActEngine:
if self._should_compress(conversation, compressor):
try:
conversation = await compressor.compress(conversation)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
logger.warning(f"Incremental compression failed: {e}")
else:
# ponytail: 检查是否为畸形工具调用(含 <tool_use> 但解析失败)
@ -1332,7 +1340,7 @@ class ReActEngine:
reinjections,
)
break
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
logger.warning(f"Verification loop failed: {e}")
# Yield final_answer event (legacy format for execute_stream consumers)
@ -1428,7 +1436,8 @@ class ReActEngine:
value={"output_summary": summary, "agent_name": agent_name},
metadata={"task_type": task_type, "outcome": trace_outcome},
)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
# EpisodicMemory 持久化故障PG/Redis— 不影响主结果
logger.warning(f"Failed to store task result in episodic memory: {e}")
async def execute_stream(
@ -1555,7 +1564,7 @@ class ReActEngine:
"""通过 gateway 查询 model 对应的 provider 名。失败回退 None(字符串拼接)。"""
try:
return self._llm_gateway.get_provider_name_for_model(model)
except Exception:
except (AttributeError, KeyError, LLMProviderError):
# ponytail: 测试中 gateway 可能是 MagicMock,无该方法;回退保守路径
return None
@ -1723,7 +1732,7 @@ class ReActEngine:
if compressor and tool_name:
try:
content = await compressor.compress_tool_result(tool_name, result)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
logger.warning(f"Tool result compression failed for '{tool_name}': {e}")
content = str(result)
return {
@ -1771,10 +1780,11 @@ class ReActEngine:
"error_code": e.error_code,
"details": e.details,
}
except Exception as e:
except (ValueError, TypeError, RuntimeError, asyncio.TimeoutError) as e:
# 工具执行失败 — 记录结构化错误码,LLM 可在下一步调整策略
error_msg = f"Tool '{tool_name}' execution failed: {e}"
logger.warning(error_msg)
return {"error": error_msg}
return {"error": error_msg, "error_code": "tool_execution_failed"}
async def _execute_tool_with_confirmation(
self,
@ -1818,7 +1828,10 @@ class ReActEngine:
if confirmation_handler is not None:
try:
approved = await confirmation_handler(confirmation_id, command, reason)
except asyncio.CancelledError:
raise
except Exception as e:
# 用户提供的 confirmation_handler — 任意异常都可能,不阻塞主循环
logger.warning(f"Confirmation handler error: {e}")
if approved:
@ -1829,8 +1842,11 @@ class ReActEngine:
clean_args["_skip_dangerous_check"] = True
try:
tool_result = await tool.safe_execute(**clean_args)
except Exception as e:
tool_result = {"error": f"Tool '{tc.name}' execution failed: {e}"}
except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
tool_result = {
"error": f"Tool '{tc.name}' execution failed: {e}",
"error_code": "tool_execution_failed",
}
else:
# Non-dangerous tool: re-execute with skip flag
clean_args = {k: v for k, v in tc.arguments.items() if not k.startswith("_")}
@ -1841,7 +1857,7 @@ class ReActEngine:
if tool
else {"error": f"Tool '{tc.name}' not found"}
)
except Exception as e:
except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
tool_result = {"error": f"Tool '{tc.name}' execution failed: {e}"}
events.append(

View File

@ -11,23 +11,21 @@ import logging
import re
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import TYPE_CHECKING, Any
from agentkit.core.exceptions import TaskCancelledError, TaskTimeoutError
from agentkit.core.exceptions import LLMProviderError, TaskCancelledError, TaskTimeoutError
from agentkit.core.protocol import CancellationToken
from agentkit.core.react import ReActEngine, ReActEvent, ReActResult, ReActStep
from agentkit.llm.gateway import LLMGateway
from agentkit.llm.protocol import LLMResponse
from agentkit.tools.base import Tool
from agentkit.telemetry.tracing import get_tracer, start_span, _OTEL_AVAILABLE
from agentkit.tools.base import Tool, ToolValidationError
from agentkit.telemetry.tracing import start_span, _OTEL_AVAILABLE
from agentkit.telemetry.metrics import (
agent_request_counter,
agent_duration_histogram,
)
if TYPE_CHECKING:
from agentkit.core.compressor import CompressionStrategy, ContextCompressor
from agentkit.core.compressor import CompressionStrategy
from agentkit.core.trace import TraceRecorder
from agentkit.memory.retriever import MemoryRetriever
@ -296,7 +294,7 @@ class ReWOOEngine:
effective_system_prompt += f"\n\n## 参考信息\n{memory_context}"
else:
effective_system_prompt = f"## 参考信息\n{memory_context}"
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
logger.warning(f"Memory retrieval failed, continuing without context: {e}")
# ── Phase 1: Planning ──
@ -360,7 +358,7 @@ class ReWOOEngine:
if compressor:
try:
llm_messages = await compressor.compress(llm_messages)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
logger.warning(f"Context compression failed: {e}")
response = await self._llm_gateway.chat(
@ -492,7 +490,7 @@ class ReWOOEngine:
value={"output_summary": summary, "agent_name": agent_name},
metadata={"task_type": task_type, "outcome": trace_outcome},
)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
logger.warning(f"Failed to store task result in episodic memory: {e}")
return ReActResult(
@ -569,7 +567,7 @@ class ReWOOEngine:
effective_system_prompt += f"\n\n## 参考信息\n{memory_context}"
else:
effective_system_prompt = f"## 参考信息\n{memory_context}"
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
logger.warning(f"Memory retrieval failed, continuing without context: {e}")
trajectory: list[ReActStep] = []
@ -647,7 +645,7 @@ class ReWOOEngine:
if compressor:
try:
llm_messages = await compressor.compress(llm_messages)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
logger.warning(f"Context compression failed: {e}")
response = await self._llm_gateway.chat(
@ -769,6 +767,9 @@ class ReWOOEngine:
"total_tokens": total_tokens,
},
)
except asyncio.CancelledError:
trace_outcome = "cancelled"
raise
except Exception as e:
trace_outcome = "error"
logger.error(f"ReWOO execute_stream failed: {e}")
@ -786,7 +787,7 @@ class ReWOOEngine:
value={"output_summary": summary, "agent_name": agent_name},
metadata={"task_type": task_type, "outcome": trace_outcome},
)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
logger.warning(f"Failed to store task result in episodic memory: {e}")
# ── Fallback Strategy Helpers ──────────────────────────
@ -914,7 +915,7 @@ class ReWOOEngine:
output, synthesis_tokens = await self._synthesis_phase(messages=messages, tool_results=tool_results, model=model, agent_name=agent_name, task_type=task_type, system_prompt=effective_system_prompt, compressor=compressor, cancellation_token=cancellation_token)
yield ReActEvent(event_type="final_answer", step=len(plan.steps) + 1, data={"output": output, "total_steps": len(plan.steps) + 1, "total_tokens": simplified_tokens + synthesis_tokens})
return
except Exception as e:
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
logger.warning(f"Simplified ReWOO planning also failed in stream mode: {e}")
# Failed, continue to next strategy by not returning
# This signals the caller to try the next strategy
@ -951,7 +952,7 @@ class ReWOOEngine:
):
yield event
return
except Exception as e:
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ToolValidationError) as e:
logger.warning(f"ReAct fallback also failed in stream mode: {e}")
raise _FallbackFailedError("react")
@ -975,13 +976,13 @@ class ReWOOEngine:
if compressor:
try:
direct_messages = await compressor.compress(direct_messages)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
logger.warning(f"Context compression failed in direct fallback: {e}")
direct_response = await self._llm_gateway.chat(messages=direct_messages, model=model, agent_name=agent_name, task_type=task_type)
output = direct_response.content or ""
yield ReActEvent(event_type="final_answer", step=1, data={"output": output, "total_steps": 1, "total_tokens": total_tokens + direct_response.usage.total_tokens})
return
except Exception as e:
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
logger.error(f"Direct LLM fallback also failed in stream mode: {e}")
raise _FallbackFailedError("direct")
@ -1024,7 +1025,7 @@ class ReWOOEngine:
output, synthesis_tokens = await self._synthesis_phase(messages=messages, tool_results=tool_results, model=model, agent_name=agent_name, task_type=task_type, system_prompt=effective_system_prompt, compressor=compressor, cancellation_token=cancellation_token)
yield ReActEvent(event_type="final_answer", step=len(plan.steps) + 1, data={"output": output, "total_steps": len(plan.steps) + 1, "total_tokens": plan_tokens + synthesis_tokens})
return
except Exception as e:
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
logger.warning(f"Plan-exec fallback also failed in stream mode: {e}")
raise _FallbackFailedError("plan_exec")
@ -1178,7 +1179,7 @@ class ReWOOEngine:
total_tokens=total_tokens,
fallback_strategy="simplified_rewoo",
)
except Exception as e:
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
logger.warning(f"Simplified ReWOO planning also failed: {e}")
return None
@ -1219,7 +1220,7 @@ class ReWOOEngine:
)
react_result.fallback_strategy = "react"
return react_result
except Exception as e:
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ToolValidationError) as e:
logger.warning(f"ReAct fallback also failed: {e}")
return None
@ -1247,7 +1248,7 @@ class ReWOOEngine:
if compressor:
try:
direct_messages = await compressor.compress(direct_messages)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
logger.warning(f"Context compression failed in direct fallback: {e}")
direct_response = await self._llm_gateway.chat(
@ -1284,7 +1285,7 @@ class ReWOOEngine:
total_tokens=total_tokens,
fallback_strategy="direct",
)
except Exception as e:
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
logger.error(f"Direct LLM fallback also failed: {e}")
return None
@ -1361,7 +1362,7 @@ class ReWOOEngine:
total_tokens=total_tokens,
fallback_strategy="plan_exec",
)
except Exception as e:
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
logger.warning(f"Plan-exec fallback also failed: {e}")
return None
@ -1418,7 +1419,7 @@ class ReWOOEngine:
if compressor:
try:
planning_messages = await compressor.compress(planning_messages)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
logger.warning(f"Context compression failed during planning: {e}")
try:
@ -1429,7 +1430,7 @@ class ReWOOEngine:
task_type=task_type,
tools=tool_schemas,
)
except Exception as e:
except (LLMProviderError, asyncio.TimeoutError, ConnectionError) as e:
logger.warning(f"LLM call failed during planning: {e}")
return None, 0
@ -1496,7 +1497,7 @@ class ReWOOEngine:
if compressor:
try:
synthesis_messages = await compressor.compress(synthesis_messages)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
logger.warning(f"Context compression failed during synthesis: {e}")
response = await self._llm_gateway.chat(
@ -1611,7 +1612,7 @@ class ReWOOEngine:
try:
result = await tool.safe_execute(**arguments)
return result
except Exception as e:
except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
error_msg = f"Tool '{tool_name}' execution failed: {e}"
logger.warning(error_msg)
return {"error": error_msg}

View File

@ -5,6 +5,7 @@
from __future__ import annotations
import asyncio
import copy
import logging
from datetime import datetime, timezone
@ -17,8 +18,6 @@ from .expert import Expert
from .plan import PhaseStatus, PhaseType, PlanPhase, TeamPlan
if TYPE_CHECKING:
import asyncio
from .team import ExpertTeam
logger = logging.getLogger(__name__)
@ -61,7 +60,7 @@ class PhaseExecutorMixin:
full_data = await self._team.workspace.read(ref_key)
if full_data:
return full_data.get("value", content)
except Exception as e:
except (asyncio.TimeoutError, ConnectionError, KeyError, AttributeError) as e:
logger.warning(f"Failed to read offloaded output '{ref_key}': {e}")
return content
@ -80,11 +79,11 @@ class PhaseExecutorMixin:
try:
# U3: 返工循环 — 最多 MAX_REWORKS + 1 次1 次初始 + MAX_REWORKS 次返工)
for _rework_attempt in range(self.MAX_REWORKS + 1):
result, last_error, passed, feedback = await self._run_agent_steps(
result, last_error, passed, feedback, degraded = await self._run_agent_steps(
expert, agent, lead, phase, plan
)
done = await self._finalize_phase(
expert, lead, phase, plan, result, passed, feedback
expert, lead, phase, plan, result, passed, feedback, degraded
)
if done:
return result
@ -181,9 +180,10 @@ class PhaseExecutorMixin:
lead: Expert,
phase: PlanPhase,
plan: TeamPlan,
) -> tuple[dict[str, Any], str | None, bool, str]:
) -> tuple[dict[str, Any], str | None, bool, str, bool]:
"""Run one rework iteration: read deps, build input, execute, review. Returns
(result, last_error, passed, feedback). Raises RuntimeError on retry exhaustion."""
(result, last_error, passed, feedback, degraded). Raises RuntimeError on retry
exhaustion."""
# 每次迭代重新读取依赖输出(前置阶段可能在返工期间完成)
dependency_outputs: dict[str, Any] = {}
for dep_id in phase.depends_on:
@ -228,7 +228,12 @@ class PhaseExecutorMixin:
raise RuntimeError(f"Agent execution failed: {last_error}")
result = task_result.output_data or {"content": ""}
break
except Exception as e:
except asyncio.CancelledError:
# CancelledError 必须传播,不被重试逻辑吞掉
raise
except (RuntimeError, asyncio.TimeoutError, ConnectionError) as e:
# agent.execute() 内部已捕获所有异常并返回 TaskResult
# 此处仅捕获显式抛出的 RuntimeError + 罕见的基础设施异常
last_error = str(e)
if attempt < self.MAX_RETRIES:
logger.info(f"Retrying phase {phase.id} (attempt {attempt + 1})")
@ -250,9 +255,9 @@ class PhaseExecutorMixin:
"risk_description": risk_desc, "phase_id": phase.id, "phase_name": phase.name,
})
# U3: Lead 验收阶段输出
passed, feedback = await self._review_phase_output(lead, phase, result)
return result, last_error, passed, feedback
# U3: Lead 验收阶段输出 — ReviewResult 结构化结果(含 degraded 标记)
review = await self._review_phase_output(lead, phase, result)
return result, last_error, review.passed, review.feedback, review.degraded
async def _finalize_phase(
self,
@ -263,9 +268,15 @@ class PhaseExecutorMixin:
result: dict[str, Any],
passed: bool,
feedback: str,
degraded: bool = False,
) -> bool:
"""Handle review outcome: write workspace + emit completed, or rework/fail. Returns
True if done (COMPLETED), False if rework continues. Raises on rework limit."""
True if done (COMPLETED), False if rework continues. Raises on rework limit.
Args:
degraded: True 表示验收走了降级路径LLM 不可用/超时/异常时自动通过
广播到 ``review_result`` 事件 payload 让前端/运维可编程判断
"""
if passed:
phase.status = PhaseStatus.COMPLETED
# P2: SharedWorkspace 写入移到验收通过后 — 避免持久化被拒输出
@ -276,6 +287,7 @@ class PhaseExecutorMixin:
await self._broadcast_event("review_result", {
"phase_id": phase.id, "phase_name": phase.name, "passed": True,
"feedback": feedback, "expert": phase.assigned_expert,
"degraded": degraded,
})
if phase.collaboration_contracts:
await self._notify_collaborators(phase, plan)
@ -288,7 +300,7 @@ class PhaseExecutorMixin:
})
return True
# 验收不合格 — 返工或标记失败
# 验收不合格 — 返工或标记失败degraded 路径不应走到这里,但保持字段一致)
phase.rework_count += 1
phase.review_feedback = feedback
@ -304,6 +316,7 @@ class PhaseExecutorMixin:
"expert": phase.assigned_expert,
"rework_count": phase.rework_count,
"final_status": "failed",
"degraded": degraded,
},
)
await self._broadcast_event(
@ -329,6 +342,7 @@ class PhaseExecutorMixin:
"expert": phase.assigned_expert,
"rework_count": phase.rework_count,
"final_status": "rework",
"degraded": degraded,
},
)
feedback_truncated = feedback[:500] if feedback else ""
@ -377,7 +391,8 @@ class PhaseExecutorMixin:
agent = await pool.create_agent(temp_config)
self._temp_agents[phase.id] = temp_config.name
return agent
except Exception as e:
except (ValueError, KeyError, RuntimeError, TypeError) as e:
# pool.create_agent 失败config 校验/工具注册/依赖缺失等
logger.warning(
f"Failed to create isolated agent for phase {phase.id}, "
f"using expert's existing agent: {e}"
@ -393,5 +408,7 @@ class PhaseExecutorMixin:
if temp_name:
try:
await pool.remove_agent(temp_name)
except Exception as e:
except asyncio.CancelledError:
raise
except (KeyError, RuntimeError) as e:
logger.warning(f"Failed to clean up isolated agent '{temp_name}': {e}")

View File

@ -5,11 +5,15 @@
from __future__ import annotations
import asyncio
import json
import logging
import re
from dataclasses import dataclass
from typing import Any
from agentkit.core.exceptions import LLMProviderError
from .expert import Expert
from .plan import PlanPhase
@ -19,27 +23,46 @@ logger = logging.getLogger(__name__)
_RISK_FLAG_RE = re.compile(r"\[RISK:\s*(.+?)\]", re.DOTALL)
@dataclass
class ReviewResult:
"""Lead 验收阶段输出的结构化结果U3
替换原先的 ``tuple[bool, str]`` 返回值让降级状态可被调用方/前端
可编程判断而非依赖 ``[DEGRADED]`` 字符串前缀匹配
Attributes:
passed: 验收是否通过True=通过False=需返工
degraded: 是否处于降级路径LLM 不可用/超时/异常时自动通过
feedback: 验收反馈降级时为降级原因正常通过时为空需返工时为修改要求
"""
passed: bool
degraded: bool = False
feedback: str = ""
class ReviewGateMixin:
"""Mixin: Lead 验收阶段输出质量 + 解析风险标记。由 TeamOrchestrator 组合。"""
async def _review_phase_output(
self, lead: Expert, phase: PlanPhase, result: dict[str, Any]
) -> tuple[bool, str]:
) -> ReviewResult:
"""Lead 验收阶段输出质量。
LLM 判断输出是否满足阶段要求
返回 (passed, feedback)
- passed=True, feedback="" 验收通过
- passed=False, feedback="修改要求" 验收不合格需返工
LLM 判断输出是否满足阶段要求返回 :class:`ReviewResult`
- ``passed=True, degraded=False`` 验收通过
- ``passed=False, feedback="修改要求"`` 验收不合格需返工
- ``passed=True, degraded=True`` LLM 不可用/超时/异常优雅降级自动通过
LLM 不可用跳过验收直接通过优雅降级feedback 标注降级原因
降级路径以 ``degraded=True`` 显式标记 ``review_result`` WS 事件
和日志聚合可编程判断降级频率无需匹配 ``[DEGRADED]`` 字符串前缀
"""
gateway = self._get_llm_gateway(lead)
if not gateway:
logger.warning("No LLM gateway available, skipping review")
# 优雅降级:不阻塞流程,但 [DEGRADED] 前缀让 review_result 事件
# 和日志聚合可识别降级路径,便于运维监控验收失效频率。
return True, "[DEGRADED] LLM 验收不可用,自动通过"
return ReviewResult(
passed=True, degraded=True, feedback="LLM 验收不可用,自动通过"
)
content = result.get("content", str(result))
# P1: prompt injection 防护 — 用 XML 标签包裹专家输出,指示 LLM 忽略其中指令
@ -60,6 +83,15 @@ class ReviewGateMixin:
messages=[{"role": "user", "content": prompt}],
model=self._get_model(lead),
)
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError) as e:
# LLM 不可用类异常 — 优雅降级,不阻塞流程。
# ponytail: RuntimeError 纳入捕获 — LiteLLM/provider 内部错误常以 RuntimeError
# 抛出(如 "LLM unavailable"),验收路径语义是"LLM 调用失败即降级",需覆盖。
logger.warning(f"Review LLM call failed, degrading: {e}")
return ReviewResult(
passed=True, degraded=True, feedback=f"LLM 验收降级,自动通过: {e}"
)
# P2: 优先尝试直接解析整个响应为 JSON避免贪婪正则匹配过多
review: dict[str, Any] | None = None
try:
@ -79,13 +111,14 @@ class ReviewGateMixin:
passed_raw = review.get("passed", True)
passed = passed_raw is True or str(passed_raw).lower() == "true"
feedback = review.get("feedback", "")
return passed, str(feedback)
logger.warning(f"Review LLM returned unparseable response: {response.content[:200]}")
except Exception as e:
logger.warning(f"Review LLM call failed: {e}")
return ReviewResult(passed=passed, feedback=str(feedback))
# 降级:不阻塞流程,但 [DEGRADED] 前缀让 review_result 事件可识别降级路径
return True, "[DEGRADED] LLM 验收降级,自动通过"
# 现有行为LLM 返回不可解析响应时也走降级通过plan 文档 line 274 标注
# passed=False但实际生产行为是降级通过避免阻塞流水线 — 以现有行为为准)。
logger.warning(f"Review LLM returned unparseable response: {response.content[:200]}")
return ReviewResult(
passed=True, degraded=True, feedback="LLM 验收响应不可解析,自动通过"
)
@staticmethod
def _parse_risk_flags(content: str) -> list[str]:

View File

@ -16,6 +16,7 @@ import logging
import re
from typing import Any
from agentkit.core.exceptions import LLMProviderError
from agentkit.llm.gateway import LLMGateway
from ._debate_runner import DebateRunnerMixin
@ -169,7 +170,7 @@ class TeamOrchestrator(
if self._checkpoint is not None:
try:
await self._checkpoint.save_plan(plan)
except Exception as e:
except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError, ValueError, KeyError) as e:
logger.warning(f"Checkpoint save_plan failed: {e}")
# 4. Set EXECUTING status, execute phases
@ -266,7 +267,7 @@ class TeamOrchestrator(
if should_save_checkpoint and self._checkpoint is not None:
try:
await self._checkpoint.save(plan.id, ph, plan.status.value)
except Exception as e:
except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError, ValueError, KeyError) as e:
logger.warning(f"Checkpoint save failed for phase {ph.id}: {e}")
# U3: Divergence detection — check completed phases for conflicts
@ -310,7 +311,7 @@ class TeamOrchestrator(
if self._checkpoint is not None:
try:
await self._checkpoint.clear(plan.id)
except Exception as e:
except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError, ValueError, KeyError) as e:
logger.warning(f"Checkpoint clear failed: {e}")
return {
@ -326,7 +327,9 @@ class TeamOrchestrator(
plan.status = PlanStatus.FAILED
await self._broadcast_event("team_dissolved", {"team_id": self._team.team_id})
return await self._fallback_to_single_agent(task, plan, phase_results)
except Exception as e:
except asyncio.CancelledError:
raise
except (RuntimeError, ValueError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError, LLMProviderError) as e:
logger.error(f"Pipeline execution failed: {e}")
plan.status = PlanStatus.FAILED
await self._broadcast_event("team_dissolved", {"team_id": self._team.team_id})
@ -463,7 +466,7 @@ class TeamOrchestrator(
if phases:
return phases
logger.warning("LLM decomposition returned no valid phases")
except Exception as e:
except (LLMProviderError, asyncio.TimeoutError, ConnectionError, json.JSONDecodeError, ValueError, TypeError) as e:
logger.warning(f"LLM task decomposition failed: {e}")
return [PlanPhase(name="执行", assigned_expert=lead.config.name, task_description=task)]
@ -588,5 +591,5 @@ class TeamOrchestrator(
await self._team.handoff_transport.send(
self._team.team_channel, {"type": event_type, **data}
)
except Exception as e:
except (ConnectionError, RuntimeError, OSError, asyncio.TimeoutError) as e:
logger.warning(f"Failed to broadcast event '{event_type}': {e}")

View File

@ -20,7 +20,7 @@ from agentkit.orchestrator.pipeline_schema import (
StageStatus,
)
from agentkit.orchestrator.reflection import PipelineReflector, PipelineReplanner
from agentkit.orchestrator.retry import StepRetryPolicy, execute_with_retry
from agentkit.orchestrator.retry import execute_with_retry
logger = logging.getLogger(__name__)
@ -143,7 +143,7 @@ class PipelineEngine:
steps=step_names,
input_data=context,
)
except Exception as exc:
except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
logger.warning(f"Failed to create execution state: {exc}")
# Create Saga orchestrator for compensation tracking
@ -183,7 +183,7 @@ class PipelineEngine:
output=step_output,
error=step_error,
)
except Exception as exc:
except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
logger.warning(f"Failed to update step state: {exc}")
# 收集输出变量
@ -219,7 +219,7 @@ class PipelineEngine:
step_name=stage.name,
error=result.error_message,
)
except Exception as exc:
except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
logger.warning(f"Failed to persist failure state: {exc}")
return result
@ -237,7 +237,7 @@ class PipelineEngine:
execution_id=execution_id,
final_output=final_output,
)
except Exception as exc:
except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
logger.warning(f"Failed to persist completion state: {exc}")
return result
@ -346,7 +346,11 @@ class PipelineEngine:
return sr
except Exception as e:
except asyncio.CancelledError:
raise
except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
# dispatcher / agent 执行失败 — 转 StageResult.FAILED 不向上抛
return StageResult(
stage_name=stage.name,
status=StageStatus.FAILED,
@ -475,7 +479,9 @@ class PipelineEngine:
stage,
started_at,
)
except Exception as e:
except asyncio.CancelledError:
raise
except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
logger.error(f"Verifier execution failed for stage '{stage.name}': {e}")
return StageResult(
stage_name=stage.name,
@ -619,7 +625,9 @@ class PipelineEngine:
step_name=stage.name,
)
return sr
except Exception as e:
except asyncio.CancelledError:
raise
except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
return StageResult(
stage_name=stage.name,
status=StageStatus.FAILED,
@ -679,7 +687,7 @@ class PipelineEngine:
score=output_data.get("score", 0.0),
)
return feedback
except Exception as e:
except (TypeError, KeyError, ValueError) as e:
# 解析失败时直接抛出异常,避免死循环
logger.error(f"Failed to parse verifier output: {e}")
raise RuntimeError(

View File

@ -790,10 +790,18 @@ class TestResultSynthesis:
{"name": "A", "assigned_expert": "member1", "task_description": "阶段A", "depends_on": []},
{"name": "B", "assigned_expert": "member2", "task_description": "阶段B", "depends_on": []},
])
# Synthesis call raises to force concatenation fallback
gateway.chat = AsyncMock(
side_effect=[decomp_response, RuntimeError("LLM unavailable")]
)
# ponytail: 函数式 side_effect — 首次返回 decomposition后续一律抛 RuntimeError
# (列表式 side_effect 耗尽会抛 StopIteration被 U3 收窄后的 except 漏捕获;
# 函数式让"LLM 不可用"语义明确,覆盖验收+综合所有后续调用)
call_count = [0]
async def chat_side_effect(messages, model=None, **kwargs):
call_count[0] += 1
if call_count[0] == 1:
return decomp_response
raise RuntimeError("LLM unavailable")
gateway.chat = AsyncMock(side_effect=chat_side_effect)
team._experts["lead"].agent._llm_gateway = gateway
result = await orchestrator.execute("复杂任务")