2026-07-01 00:45:35 +08:00
11 changed files with 256 additions and 149 deletions
--- a/src/agentkit/core/base.py
+++ b/src/agentkit/core/base.py
@ -246,7 +246,7 @@ class BaseAgent(ABC):
                self._redis = aioredis.from_url(redis_url, decode_responses=True)
                await self._redis.ping()
                logger.info(f"Agent '{self.name}' connected to Redis")
-            except Exception as e:
+            except (ConnectionError, OSError, asyncio.TimeoutError, ValueError) as e:
                self._redis = None
                logger.warning(
                    f"Agent '{self.name}' Redis unavailable: {e}, falling back to local mode"
@ -380,7 +380,10 @@ class BaseAgent(ABC):
            # 失败钩子
            try:
                await self.on_task_failed(task, TaskCancelledError(task.task_id))
+            except asyncio.CancelledError:
+                raise
            except Exception as hook_err:
+                # 用户提供的 hook — 任意异常都可能，不阻塞 TaskResult 构建
                logger.error(f"on_task_failed hook error: {hook_err}")

            elapsed = time.monotonic() - start_time
@ -408,7 +411,10 @@ class BaseAgent(ABC):
                await self.on_task_failed(
                    task, TaskTimeoutError(task.task_id, task.timeout_seconds)
                )
+            except asyncio.CancelledError:
+                raise
            except Exception as hook_err:
+                # 用户提供的 hook — 任意异常都可能，不阻塞 TaskResult 构建
                logger.error(f"on_task_failed hook error: {hook_err}")

            elapsed = time.monotonic() - start_time
@ -427,12 +433,20 @@ class BaseAgent(ABC):
                },
            )

+        except asyncio.CancelledError:
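+            # CancelledError must propagate. On Python >= 3.8 it derives from BaseException, so `except Exception` below would not swallow it anyway; the explicit re-raise documents the intent.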
+            raise
+
        except Exception as e:
+            # 框架边界 catch-all：handle_task 是用户实现，可能抛任意异常；
+            # execute() 契约要求始终返回 TaskResult，故保留兜底。
            logger.error(f"Agent '{self.name}' task {task.task_id} failed: {e}")

            # 失败钩子
            try:
                await self.on_task_failed(task, e)
+            except asyncio.CancelledError:
+                raise
            except Exception as hook_err:
                logger.error(f"on_task_failed hook error: {hook_err}")

@ -517,13 +531,13 @@ class BaseAgent(ABC):
                    f"agent:{self.name}:progress",
                    json.dumps(progress_obj.to_dict()),
                )
-            except Exception as e:
+            except (ConnectionError, asyncio.TimeoutError, OSError) as e:
                logger.warning(f"Failed to publish progress for task {task_id}: {e}")

            if self._dispatcher is not None:
                try:
                    await self._dispatcher.handle_progress(progress_obj)
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, RuntimeError) as e:
                    logger.warning(
                        f"Failed to report progress to dispatcher for task {task_id}: {e}"
                    )
@ -544,7 +558,7 @@ class BaseAgent(ABC):
                await asyncio.sleep(30)
        except asyncio.CancelledError:
            pass
-        except Exception as e:
+        except (ConnectionError, asyncio.TimeoutError, OSError, RuntimeError) as e:
            logger.error(f"Heartbeat error for agent '{self.name}': {e}")

    async def _listen_for_tasks(self):
@ -565,11 +579,11 @@ class BaseAgent(ABC):
                        task_data = json.loads(task_json)
                        task = TaskMessage.from_dict(task_data)
                        asyncio.create_task(self._execute_task_with_semaphore(task))
-                    except Exception as e:
+                    except (json.JSONDecodeError, KeyError, TypeError, ValueError) as e:
                        logger.error(f"Failed to parse task message: {e}")
        except asyncio.CancelledError:
            pass
-        except Exception as e:
+        except (ConnectionError, asyncio.TimeoutError, OSError, RuntimeError) as e:
            logger.error(f"Task listener error for agent '{self.name}': {e}")

    async def _execute_task_with_semaphore(self, task: TaskMessage):
@ -593,7 +607,13 @@ class BaseAgent(ABC):
            if self._redis is not None and self._dispatcher is not None:
                await self._dispatcher.handle_result(result)

+        except asyncio.CancelledError:
+            # CancelledError 必须传播，不被 except 吞掉
+            raise
+
        except Exception as e:
+            # 兜底：execute() 内部已捕获大部分异常并返回 TaskResult，
+            # 此处仅捕获 dispatcher 失败或 execute() 边界外的异常
            logger.error(f"Agent '{self.name}' task {task.task_id} failed: {e}")
            error_result = TaskResult(
                task_id=task.task_id,
@ -622,5 +642,6 @@ class BaseAgent(ABC):
            jsonschema.validate(data, schema)
        except ImportError:
            logger.warning("jsonschema not installed, skipping input validation")
-        except Exception as e:
+        except (ValueError, TypeError, KeyError) as e:
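+            # NOTE(review): jsonschema.ValidationError / SchemaError derive from Exception (not ValueError), so this tuple does not catch them; they must be listed explicitly for validation failures to become SchemaValidationError.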
            raise SchemaValidationError(self.name, str(e))
--- a/src/agentkit/core/dispatcher.py
+++ b/src/agentkit/core/dispatcher.py
@ -3,6 +3,7 @@
 与业务系统解耦：通过依赖注入获取 Redis 连接和数据库会话。
 """

+import asyncio
 import ipaddress
 import json
 import logging
@ -12,7 +13,6 @@ from typing import Any, Callable, Awaitable
 from urllib.parse import urlparse

 from agentkit.core.exceptions import (
-    NoAvailableAgentError,
    TaskDispatchError,
    TaskNotFoundError,
 )
@ -51,7 +51,7 @@ def _validate_callback_url(url: str) -> bool:
    """
    try:
        parsed = urlparse(url)
-    except Exception:
+    except (ValueError, TypeError):
        return False

    if parsed.scheme not in ("http", "https"):
@ -159,7 +159,7 @@ class TaskDispatcher:

            except TaskDispatchError:
                raise
-            except Exception as e:
+            except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
                await db.rollback()
                logger.error(f"Failed to dispatch task {task.task_id}: {e}")
                raise TaskDispatchError(task.task_id, str(e))
@ -197,7 +197,7 @@ class TaskDispatcher:

            except TaskNotFoundError:
                raise
-            except Exception as e:
+            except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
                await db.rollback()
                logger.error(f"Failed to cancel task {task_id}: {e}")
                raise
@ -263,7 +263,7 @@ class TaskDispatcher:

                logger.info(f"Task {result.task_id} result handled (status={result.status})")

-            except Exception as e:
+            except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
                await db.rollback()
                logger.error(f"Failed to handle result for task {result.task_id}: {e}")

@ -295,7 +295,7 @@ class TaskDispatcher:
                )
                await db.commit()

-            except Exception as e:
+            except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
                await db.rollback()
                logger.error(f"Failed to handle progress for task {progress.task_id}: {e}")

@ -359,7 +359,7 @@ class TaskDispatcher:
                if retried > 0:
                    logger.info(f"Retried {retried} failed tasks")

-            except Exception as e:
+            except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
                await db.rollback()
                logger.error(f"Failed to retry failed tasks: {e}")

@ -392,7 +392,7 @@ class TaskDispatcher:
            async with httpx.AsyncClient(timeout=10) as client:
                await client.post(callback_url, json=result.to_dict())
                logger.info(f"Callback triggered for task {result.task_id}")
-        except Exception as e:
+        except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError) as e:
            logger.warning(f"Callback failed for task {result.task_id}: {e}")

    def _task_to_dict(self, task: Any) -> dict:
--- a/src/agentkit/core/orchestrator.py
+++ b/src/agentkit/core/orchestrator.py
@ -12,7 +12,8 @@ from dataclasses import dataclass, field
 from enum import Enum
 from typing import TYPE_CHECKING, Any

-from agentkit.core.protocol import TaskMessage, TaskResult, TaskStatus
+from agentkit.core.exceptions import LLMProviderError
+from agentkit.core.protocol import TaskMessage, TaskStatus
 from agentkit.core.shared_workspace import SharedWorkspace

 if TYPE_CHECKING:
@ -224,7 +225,7 @@ class Orchestrator:
                        subtasks=subtasks,
                        parallel_groups=parallel_groups,
                    )
-            except Exception as e:
+            except (RuntimeError, ValueError, KeyError, AttributeError) as e:
                logger.warning(f"GoalPlanner decomposition failed, falling back: {e}")

        # If LLM gateway available, use it for decomposition
@ -239,7 +240,7 @@ class Orchestrator:
                        subtasks=subtasks,
                        parallel_groups=parallel_groups,
                    )
-            except Exception as e:
+            except (LLMProviderError, asyncio.TimeoutError, ConnectionError, ValueError, TypeError, KeyError) as e:
                logger.warning(f"LLM decomposition failed, falling back to simple: {e}")

        # Fallback: single subtask = original task
@ -418,7 +419,7 @@ class Orchestrator:
                            "status": "completed",
                        },
                    ))
-                except Exception as e:
+                except (ConnectionError, RuntimeError, OSError) as e:
                    logger.warning(f"Failed to publish progress via MessageBus: {e}")

            return output
@ -437,10 +438,12 @@ class Orchestrator:
                            "error": "Subtask timed out",
                        },
                    ))
-                except Exception as e:
+                except (ConnectionError, RuntimeError, OSError) as e:
                    logger.warning(f"Failed to publish progress via MessageBus: {e}")
            return error_result
-        except Exception as e:
+        except asyncio.CancelledError:
+            raise
+        except (RuntimeError, ValueError, KeyError, AttributeError, ConnectionError, LLMProviderError) as e:
            error_result = {"status": "failed", "error": str(e)}
            if self._message_bus is not None:
                try:
@ -455,7 +458,7 @@ class Orchestrator:
                            "error": str(e),
                        },
                    ))
-                except Exception as e:
+                except (ConnectionError, RuntimeError, OSError) as e:
                    logger.warning(f"Failed to publish progress via MessageBus: {e}")
            return error_result

@ -513,7 +516,7 @@ class Orchestrator:
        try:
            agents_info = self._agent_pool.list_agents()
            return [a["name"] for a in agents_info]
-        except Exception:
+        except (RuntimeError, KeyError, AttributeError):
            return []

    def _convert_execution_plan_to_subtasks(
@ -561,7 +564,7 @@ class Orchestrator:
                    description = agent.get("description", "").lower()
                    if skill.lower() in name.lower() or skill.lower() in agent_type.lower() or skill.lower() in description:
                        return name
-        except Exception:
+        except (RuntimeError, KeyError, AttributeError):
            pass
        return None

@ -580,9 +583,6 @@ class Orchestrator:
        Returns:
            OrchestrationResult: 编排结果，metadata 中包含迭代历史
        """
-        import time as _time
-
-        start_time = _time.monotonic()
        iteration_history: list[dict[str, Any]] = []

        # First execution
@ -650,7 +650,7 @@ class Orchestrator:

        try:
            return await self._llm_evaluate(task, result)
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, ValueError, RuntimeError) as e:
            logger.warning(f"LLM evaluation failed, falling back to rule-based: {e}")
            return self._rule_based_evaluate(result)

--- a/src/agentkit/core/plan_exec_engine.py
+++ b/src/agentkit/core/plan_exec_engine.py
@ -18,7 +18,7 @@ from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from typing import TYPE_CHECKING, Any, Awaitable, Callable

-from agentkit.core.exceptions import TaskCancelledError, TaskTimeoutError
+from agentkit.core.exceptions import LLMProviderError, TaskCancelledError, TaskTimeoutError
 from agentkit.core.goal_planner import GoalPlanner
 from agentkit.core.plan_executor import PlanExecutor, PlanExecutionResult
 from agentkit.core.plan_schema import ExecutionPlan, PlanStep, PlanStepStatus
@ -214,7 +214,7 @@ class PlanExecEngine:
                        system_prompt += f"\n\n## 参考信息\n{memory_context}"
                    else:
                        system_prompt = f"## 参考信息\n{memory_context}"
-            except Exception as e:
+            except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                logger.warning(f"Memory retrieval failed, continuing without context: {e}")

        # 启动轨迹记录
@ -440,7 +440,7 @@ class PlanExecEngine:
                        value={"output_summary": summary, "agent_name": agent_name},
                        metadata={"task_type": task_type, "outcome": trace_outcome},
                    )
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
                    logger.warning(f"Failed to store task result in episodic memory: {e}")

    # ------------------------------------------------------------------
@ -477,7 +477,7 @@ class PlanExecEngine:
                        system_prompt += f"\n\n## 参考信息\n{memory_context}"
                    else:
                        system_prompt = f"## 参考信息\n{memory_context}"
-            except Exception as e:
+            except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                logger.warning(f"Memory retrieval failed, continuing without context: {e}")

        # 启动轨迹记录
@ -514,7 +514,7 @@ class PlanExecEngine:
                        "goal": plan.goal,
                        "steps": [s.to_dict() for s in plan.steps],
                    })
-                except Exception as e:
+                except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
                    logger.warning(f"Step event callback failed: {e}")

            trajectory.append(ReActStep(
@ -535,7 +535,7 @@ class PlanExecEngine:
                            "goal": spec.goal,
                            "num_steps": len(spec.steps),
                        })
-                    except Exception as e:
+                    except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
                        logger.warning(f"Step event callback failed: {e}")

            if trace_recorder is not None:
@ -604,7 +604,7 @@ class PlanExecEngine:
                        value={"output_summary": summary, "agent_name": agent_name},
                        metadata={"task_type": task_type, "outcome": trace_outcome},
                    )
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
                    logger.warning(f"Failed to store task result in episodic memory: {e}")

    async def _execute_with_replanning(
@ -685,7 +685,7 @@ class PlanExecEngine:
                            "result": step_result.result,
                            "error": step_result.error,
                        })
-                    except Exception as e:
+                    except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
                        logger.warning(f"Step event callback failed: {e}")

                if trace_recorder is not None:
@ -733,7 +733,7 @@ class PlanExecEngine:
                            "root_cause": reflection_report.root_cause,
                            "new_plan_id": current_plan.plan_id,
                        })
-                    except Exception as e:
+                    except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
                        logger.warning(f"Step event callback failed: {e}")

                trajectory.append(ReActStep(
--- a/src/agentkit/core/react.py
+++ b/src/agentkit/core/react.py
@ -15,7 +15,7 @@ from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from typing import TYPE_CHECKING, Any

-from agentkit.core.exceptions import LoopDetectedError, TaskCancelledError, TaskTimeoutError
+from agentkit.core.exceptions import LLMProviderError, LoopDetectedError, TaskCancelledError, TaskTimeoutError
 from agentkit.core.protocol import CancellationToken
 from agentkit.llm.gateway import LLMGateway
 from agentkit.llm.protocol import LLMResponse
@ -659,7 +659,8 @@ class ReActEngine:
                        )
                        or ""
                    )
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
+                    # 检索层故障（RAG/Redis/LLM embedding）— 不阻塞主流程
                    logger.warning(
                        f"Memory retrieval failed, continuing without context: {e}", exc_info=True
                    )
@ -679,7 +680,8 @@ class ReActEngine:
            if compressor:
                try:
                    conversation = await compressor.compress(conversation)
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
+                    # 压缩器通常调用 LLM — LLM 不可用类异常降级为原对话
                    logger.warning(
                        f"Context compression failed, continuing with original messages: {e}"
                    )
@ -1052,7 +1054,11 @@ class ReActEngine:
                                        approved = await confirmation_handler(
                                            confirmation_id, command, reason
                                        )
+                                    except asyncio.CancelledError:
+                                        raise
                                    except Exception as e:
+                                        # 用户提供的 confirmation_handler — 任意异常都可能，
+                                        # 不阻塞主循环，降级为未批准
                                        logger.warning(f"Confirmation handler error: {e}")

                                if approved:
@ -1066,9 +1072,10 @@ class ReActEngine:
                                        clean_args["_skip_dangerous_check"] = True
                                        try:
                                            tool_result = await tool.safe_execute(**clean_args)
-                                        except Exception as e:
+                                        except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
                                            tool_result = {
-                                                "error": f"Tool '{tc.name}' execution failed: {e}"
+                                                "error": f"Tool '{tc.name}' execution failed: {e}",
+                                                "error_code": "tool_execution_failed",
                                            }
                                    else:
                                        clean_args = {
@ -1083,9 +1090,10 @@ class ReActEngine:
                                                if tool
                                                else {"error": f"Tool '{tc.name}' not found"}
                                            )
-                                        except Exception as e:
+                                        except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
                                            tool_result = {
-                                                "error": f"Tool '{tc.name}' execution failed: {e}"
+                                                "error": f"Tool '{tc.name}' execution failed: {e}",
+                                                "error_code": "tool_execution_failed",
                                            }

                                yield ReActEvent(
@ -1146,7 +1154,7 @@ class ReActEngine:
                    if self._should_compress(conversation, compressor):
                        try:
                            conversation = await compressor.compress(conversation)
-                        except Exception as e:
+                        except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
                            logger.warning(f"Incremental compression failed: {e}")

                else:
@ -1217,7 +1225,7 @@ class ReActEngine:
                        if self._should_compress(conversation, compressor):
                            try:
                                conversation = await compressor.compress(conversation)
-                            except Exception as e:
+                            except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
                                logger.warning(f"Incremental compression failed: {e}")
                    else:
                        # ponytail: 检查是否为畸形工具调用（含 <tool_use> 但解析失败）
@ -1332,7 +1340,7 @@ class ReActEngine:
                                        reinjections,
                                    )
                                    break
-                            except Exception as e:
+                            except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
                                logger.warning(f"Verification loop failed: {e}")

                        # Yield final_answer event (legacy format for execute_stream consumers)
@ -1428,7 +1436,8 @@ class ReActEngine:
                        value={"output_summary": summary, "agent_name": agent_name},
                        metadata={"task_type": task_type, "outcome": trace_outcome},
                    )
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
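+                    # EpisodicMemory persistence failure (PG/Redis) must not affect the main result. NOTE(review): driver errors such as redis.exceptions.RedisError are not subclasses of the builtins caught here; confirm the store wraps them.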
                    logger.warning(f"Failed to store task result in episodic memory: {e}")

    async def execute_stream(
@ -1555,7 +1564,7 @@ class ReActEngine:
        """通过 gateway 查询 model 对应的 provider 名。失败回退 None(字符串拼接)。"""
        try:
            return self._llm_gateway.get_provider_name_for_model(model)
-        except Exception:
+        except (AttributeError, KeyError, LLMProviderError):
            # ponytail: 测试中 gateway 可能是 MagicMock,无该方法;回退保守路径
            return None

@ -1723,7 +1732,7 @@ class ReActEngine:
        if compressor and tool_name:
            try:
                content = await compressor.compress_tool_result(tool_name, result)
-            except Exception as e:
+            except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
                logger.warning(f"Tool result compression failed for '{tool_name}': {e}")
                content = str(result)
        return {
@ -1771,10 +1780,11 @@ class ReActEngine:
                "error_code": e.error_code,
                "details": e.details,
            }
-        except Exception as e:
+        except (ValueError, TypeError, RuntimeError, asyncio.TimeoutError) as e:
+            # 工具执行失败 — 记录结构化错误码,LLM 可在下一步调整策略
            error_msg = f"Tool '{tool_name}' execution failed: {e}"
            logger.warning(error_msg)
-            return {"error": error_msg}
+            return {"error": error_msg, "error_code": "tool_execution_failed"}

    async def _execute_tool_with_confirmation(
        self,
@ -1818,7 +1828,10 @@ class ReActEngine:
            if confirmation_handler is not None:
                try:
                    approved = await confirmation_handler(confirmation_id, command, reason)
+                except asyncio.CancelledError:
+                    raise
                except Exception as e:
+                    # 用户提供的 confirmation_handler — 任意异常都可能，不阻塞主循环
                    logger.warning(f"Confirmation handler error: {e}")

            if approved:
@ -1829,8 +1842,11 @@ class ReActEngine:
                    clean_args["_skip_dangerous_check"] = True
                    try:
                        tool_result = await tool.safe_execute(**clean_args)
-                    except Exception as e:
-                        tool_result = {"error": f"Tool '{tc.name}' execution failed: {e}"}
+                    except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
+                        tool_result = {
+                            "error": f"Tool '{tc.name}' execution failed: {e}",
+                            "error_code": "tool_execution_failed",
+                        }
                else:
                    # Non-dangerous tool: re-execute with skip flag
                    clean_args = {k: v for k, v in tc.arguments.items() if not k.startswith("_")}
@ -1841,7 +1857,7 @@ class ReActEngine:
                            if tool
                            else {"error": f"Tool '{tc.name}' not found"}
                        )
-                    except Exception as e:
+                    except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
                        tool_result = {"error": f"Tool '{tc.name}' execution failed: {e}"}

                events.append(
--- a/src/agentkit/core/rewoo.py
+++ b/src/agentkit/core/rewoo.py
@ -11,23 +11,21 @@ import logging
 import re
 import time
 from dataclasses import dataclass, field
-from datetime import datetime, timezone
 from typing import TYPE_CHECKING, Any

-from agentkit.core.exceptions import TaskCancelledError, TaskTimeoutError
+from agentkit.core.exceptions import LLMProviderError, TaskCancelledError, TaskTimeoutError
 from agentkit.core.protocol import CancellationToken
 from agentkit.core.react import ReActEngine, ReActEvent, ReActResult, ReActStep
 from agentkit.llm.gateway import LLMGateway
-from agentkit.llm.protocol import LLMResponse
-from agentkit.tools.base import Tool
-from agentkit.telemetry.tracing import get_tracer, start_span, _OTEL_AVAILABLE
+from agentkit.tools.base import Tool, ToolValidationError
+from agentkit.telemetry.tracing import start_span, _OTEL_AVAILABLE
 from agentkit.telemetry.metrics import (
    agent_request_counter,
    agent_duration_histogram,
 )

 if TYPE_CHECKING:
-    from agentkit.core.compressor import CompressionStrategy, ContextCompressor
+    from agentkit.core.compressor import CompressionStrategy
    from agentkit.core.trace import TraceRecorder
    from agentkit.memory.retriever import MemoryRetriever

@ -296,7 +294,7 @@ class ReWOOEngine:
                            effective_system_prompt += f"\n\n## 参考信息\n{memory_context}"
                        else:
                            effective_system_prompt = f"## 参考信息\n{memory_context}"
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                    logger.warning(f"Memory retrieval failed, continuing without context: {e}")

            # ── Phase 1: Planning ──
@ -360,7 +358,7 @@ class ReWOOEngine:
                if compressor:
                    try:
                        llm_messages = await compressor.compress(llm_messages)
-                    except Exception as e:
+                    except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                        logger.warning(f"Context compression failed: {e}")

                response = await self._llm_gateway.chat(
@ -492,7 +490,7 @@ class ReWOOEngine:
                        value={"output_summary": summary, "agent_name": agent_name},
                        metadata={"task_type": task_type, "outcome": trace_outcome},
                    )
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
                    logger.warning(f"Failed to store task result in episodic memory: {e}")

            return ReActResult(
@ -569,7 +567,7 @@ class ReWOOEngine:
                        effective_system_prompt += f"\n\n## 参考信息\n{memory_context}"
                    else:
                        effective_system_prompt = f"## 参考信息\n{memory_context}"
-            except Exception as e:
+            except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                logger.warning(f"Memory retrieval failed, continuing without context: {e}")

        trajectory: list[ReActStep] = []
@ -647,7 +645,7 @@ class ReWOOEngine:
                if compressor:
                    try:
                        llm_messages = await compressor.compress(llm_messages)
-                    except Exception as e:
+                    except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                        logger.warning(f"Context compression failed: {e}")

                response = await self._llm_gateway.chat(
@ -769,6 +767,9 @@ class ReWOOEngine:
                    "total_tokens": total_tokens,
                },
            )
+        except asyncio.CancelledError:
+            trace_outcome = "cancelled"
+            raise
        except Exception as e:
            trace_outcome = "error"
            logger.error(f"ReWOO execute_stream failed: {e}")
@ -786,7 +787,7 @@ class ReWOOEngine:
                        value={"output_summary": summary, "agent_name": agent_name},
                        metadata={"task_type": task_type, "outcome": trace_outcome},
                    )
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
                    logger.warning(f"Failed to store task result in episodic memory: {e}")

    # ── Fallback Strategy Helpers ──────────────────────────
@ -914,7 +915,7 @@ class ReWOOEngine:
                output, synthesis_tokens = await self._synthesis_phase(messages=messages, tool_results=tool_results, model=model, agent_name=agent_name, task_type=task_type, system_prompt=effective_system_prompt, compressor=compressor, cancellation_token=cancellation_token)
                yield ReActEvent(event_type="final_answer", step=len(plan.steps) + 1, data={"output": output, "total_steps": len(plan.steps) + 1, "total_tokens": simplified_tokens + synthesis_tokens})
                return
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
            logger.warning(f"Simplified ReWOO planning also failed in stream mode: {e}")
        # Failed, continue to next strategy by not returning
        # This signals the caller to try the next strategy
@ -951,7 +952,7 @@ class ReWOOEngine:
            ):
                yield event
            return
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ToolValidationError) as e:
            logger.warning(f"ReAct fallback also failed in stream mode: {e}")
        raise _FallbackFailedError("react")

@ -975,13 +976,13 @@ class ReWOOEngine:
            if compressor:
                try:
                    direct_messages = await compressor.compress(direct_messages)
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                    logger.warning(f"Context compression failed in direct fallback: {e}")
            direct_response = await self._llm_gateway.chat(messages=direct_messages, model=model, agent_name=agent_name, task_type=task_type)
            output = direct_response.content or ""
            yield ReActEvent(event_type="final_answer", step=1, data={"output": output, "total_steps": 1, "total_tokens": total_tokens + direct_response.usage.total_tokens})
            return
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
            logger.error(f"Direct LLM fallback also failed in stream mode: {e}")
        raise _FallbackFailedError("direct")

@ -1024,7 +1025,7 @@ class ReWOOEngine:
                output, synthesis_tokens = await self._synthesis_phase(messages=messages, tool_results=tool_results, model=model, agent_name=agent_name, task_type=task_type, system_prompt=effective_system_prompt, compressor=compressor, cancellation_token=cancellation_token)
                yield ReActEvent(event_type="final_answer", step=len(plan.steps) + 1, data={"output": output, "total_steps": len(plan.steps) + 1, "total_tokens": plan_tokens + synthesis_tokens})
                return
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
            logger.warning(f"Plan-exec fallback also failed in stream mode: {e}")
        raise _FallbackFailedError("plan_exec")

@ -1178,7 +1179,7 @@ class ReWOOEngine:
                    total_tokens=total_tokens,
                    fallback_strategy="simplified_rewoo",
                )
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
            logger.warning(f"Simplified ReWOO planning also failed: {e}")
        return None

@ -1219,7 +1220,7 @@ class ReWOOEngine:
            )
            react_result.fallback_strategy = "react"
            return react_result
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ToolValidationError) as e:
            logger.warning(f"ReAct fallback also failed: {e}")
            return None

@ -1247,7 +1248,7 @@ class ReWOOEngine:
            if compressor:
                try:
                    direct_messages = await compressor.compress(direct_messages)
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                    logger.warning(f"Context compression failed in direct fallback: {e}")

            direct_response = await self._llm_gateway.chat(
@ -1284,7 +1285,7 @@ class ReWOOEngine:
                total_tokens=total_tokens,
                fallback_strategy="direct",
            )
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
            logger.error(f"Direct LLM fallback also failed: {e}")
            return None

@ -1361,7 +1362,7 @@ class ReWOOEngine:
                    total_tokens=total_tokens,
                    fallback_strategy="plan_exec",
                )
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
            logger.warning(f"Plan-exec fallback also failed: {e}")
        return None

@ -1418,7 +1419,7 @@ class ReWOOEngine:
        if compressor:
            try:
                planning_messages = await compressor.compress(planning_messages)
-            except Exception as e:
+            except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                logger.warning(f"Context compression failed during planning: {e}")

        try:
@ -1429,7 +1430,7 @@ class ReWOOEngine:
                task_type=task_type,
                tools=tool_schemas,
            )
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError) as e:
            logger.warning(f"LLM call failed during planning: {e}")
            return None, 0

@ -1496,7 +1497,7 @@ class ReWOOEngine:
        if compressor:
            try:
                synthesis_messages = await compressor.compress(synthesis_messages)
-            except Exception as e:
+            except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                logger.warning(f"Context compression failed during synthesis: {e}")

        response = await self._llm_gateway.chat(
@ -1611,7 +1612,7 @@ class ReWOOEngine:
        try:
            result = await tool.safe_execute(**arguments)
            return result
-        except Exception as e:
+        except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
            error_msg = f"Tool '{tool_name}' execution failed: {e}"
            logger.warning(error_msg)
            return {"error": error_msg}
--- a/src/agentkit/experts/_phase_executor.py
+++ b/src/agentkit/experts/_phase_executor.py
@ -5,6 +5,7 @@

 from __future__ import annotations

+import asyncio
 import copy
 import logging
 from datetime import datetime, timezone
@ -17,8 +18,6 @@ from .expert import Expert
 from .plan import PhaseStatus, PhaseType, PlanPhase, TeamPlan

 if TYPE_CHECKING:
-    import asyncio
-
    from .team import ExpertTeam

 logger = logging.getLogger(__name__)
@ -61,7 +60,7 @@ class PhaseExecutorMixin:
                    full_data = await self._team.workspace.read(ref_key)
                    if full_data:
                        return full_data.get("value", content)
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, KeyError, AttributeError) as e:
                    logger.warning(f"Failed to read offloaded output '{ref_key}': {e}")
        return content

@ -80,11 +79,11 @@ class PhaseExecutorMixin:
        try:
            # U3: 返工循环 — 最多 MAX_REWORKS + 1 次（1 次初始 + MAX_REWORKS 次返工）
            for _rework_attempt in range(self.MAX_REWORKS + 1):
-                result, last_error, passed, feedback = await self._run_agent_steps(
+                result, last_error, passed, feedback, degraded = await self._run_agent_steps(
                    expert, agent, lead, phase, plan
                )
                done = await self._finalize_phase(
-                    expert, lead, phase, plan, result, passed, feedback
+                    expert, lead, phase, plan, result, passed, feedback, degraded
                )
                if done:
                    return result
@ -181,9 +180,10 @@ class PhaseExecutorMixin:
        lead: Expert,
        phase: PlanPhase,
        plan: TeamPlan,
-    ) -> tuple[dict[str, Any], str | None, bool, str]:
+    ) -> tuple[dict[str, Any], str | None, bool, str, bool]:
        """Run one rework iteration: read deps, build input, execute, review. Returns
-        (result, last_error, passed, feedback). Raises RuntimeError on retry exhaustion."""
+        (result, last_error, passed, feedback, degraded). Raises RuntimeError on retry
+        exhaustion."""
        # 每次迭代重新读取依赖输出（前置阶段可能在返工期间完成）
        dependency_outputs: dict[str, Any] = {}
        for dep_id in phase.depends_on:
@ -228,7 +228,12 @@ class PhaseExecutorMixin:
                    raise RuntimeError(f"Agent execution failed: {last_error}")
                result = task_result.output_data or {"content": ""}
                break
-            except Exception as e:
+            except asyncio.CancelledError:
+                # CancelledError 必须传播，不被重试逻辑吞掉
+                raise
+            except (RuntimeError, asyncio.TimeoutError, ConnectionError) as e:
+                # agent.execute() 内部已捕获所有异常并返回 TaskResult，
+                # 此处仅捕获显式抛出的 RuntimeError + 罕见的基础设施异常
                last_error = str(e)
                if attempt < self.MAX_RETRIES:
                    logger.info(f"Retrying phase {phase.id} (attempt {attempt + 1})")
@ -250,9 +255,9 @@ class PhaseExecutorMixin:
            "risk_description": risk_desc, "phase_id": phase.id, "phase_name": phase.name,
        })

-        # U3: Lead 验收阶段输出
-        passed, feedback = await self._review_phase_output(lead, phase, result)
-        return result, last_error, passed, feedback
+        # U3: Lead 验收阶段输出 — ReviewResult 结构化结果（含 degraded 标记）
+        review = await self._review_phase_output(lead, phase, result)
+        return result, last_error, review.passed, review.feedback, review.degraded

    async def _finalize_phase(
        self,
@ -263,9 +268,15 @@ class PhaseExecutorMixin:
        result: dict[str, Any],
        passed: bool,
        feedback: str,
+        degraded: bool = False,
    ) -> bool:
        """Handle review outcome: write workspace + emit completed, or rework/fail. Returns
-        True if done (COMPLETED), False if rework continues. Raises on rework limit."""
+        True if done (COMPLETED), False if rework continues. Raises on rework limit.
+
+        Args:
+            degraded: True 表示验收走了降级路径（LLM 不可用/超时/异常时自动通过），
+                广播到 ``review_result`` 事件 payload 让前端/运维可编程判断。
+        """
        if passed:
            phase.status = PhaseStatus.COMPLETED
            # P2: SharedWorkspace 写入移到验收通过后 — 避免持久化被拒输出
@ -276,6 +287,7 @@ class PhaseExecutorMixin:
            await self._broadcast_event("review_result", {
                "phase_id": phase.id, "phase_name": phase.name, "passed": True,
                "feedback": feedback, "expert": phase.assigned_expert,
+                "degraded": degraded,
            })
            if phase.collaboration_contracts:
                await self._notify_collaborators(phase, plan)
@ -288,7 +300,7 @@ class PhaseExecutorMixin:
            })
            return True

-        # 验收不合格 — 返工或标记失败
+        # 验收不合格 — 返工或标记失败（degraded 路径不应走到这里，但保持字段一致）
        phase.rework_count += 1
        phase.review_feedback = feedback

@ -304,6 +316,7 @@ class PhaseExecutorMixin:
                    "expert": phase.assigned_expert,
                    "rework_count": phase.rework_count,
                    "final_status": "failed",
+                    "degraded": degraded,
                },
            )
            await self._broadcast_event(
@ -329,6 +342,7 @@ class PhaseExecutorMixin:
                "expert": phase.assigned_expert,
                "rework_count": phase.rework_count,
                "final_status": "rework",
+                "degraded": degraded,
            },
        )
        feedback_truncated = feedback[:500] if feedback else ""
@ -377,7 +391,8 @@ class PhaseExecutorMixin:
            agent = await pool.create_agent(temp_config)
            self._temp_agents[phase.id] = temp_config.name
            return agent
-        except Exception as e:
+        except (ValueError, KeyError, RuntimeError, TypeError) as e:
+            # pool.create_agent 失败：config 校验/工具注册/依赖缺失等
            logger.warning(
                f"Failed to create isolated agent for phase {phase.id}, "
                f"using expert's existing agent: {e}"
@ -393,5 +408,7 @@ class PhaseExecutorMixin:
        if temp_name:
            try:
                await pool.remove_agent(temp_name)
-            except Exception as e:
+            except asyncio.CancelledError:
+                raise
+            except (KeyError, RuntimeError) as e:
                logger.warning(f"Failed to clean up isolated agent '{temp_name}': {e}")
--- a/src/agentkit/experts/_review_gate.py
+++ b/src/agentkit/experts/_review_gate.py
@ -5,11 +5,15 @@

 from __future__ import annotations

+import asyncio
 import json
 import logging
 import re
+from dataclasses import dataclass
 from typing import Any

+from agentkit.core.exceptions import LLMProviderError
+
 from .expert import Expert
 from .plan import PlanPhase

@ -19,27 +23,46 @@ logger = logging.getLogger(__name__)
 _RISK_FLAG_RE = re.compile(r"\[RISK:\s*(.+?)\]", re.DOTALL)


+@dataclass
+class ReviewResult:
+    """Structured outcome of the lead's review of a phase output (U3).
+
+    Replaces the earlier ``tuple[bool, str]`` return so callers and the frontend
+    can detect a degraded review without matching a ``[DEGRADED]`` string prefix.
+
+    Attributes:
+        passed: Whether the review passed (True = passed, False = needs rework).
+        degraded: True when auto-passed on a degraded path (LLM unavailable/failed/unparseable).
+        feedback: Degradation reason when degraded, otherwise the reviewer's feedback.
+    """
+
+    passed: bool
+    degraded: bool = False
+    feedback: str = ""
+
+
 class ReviewGateMixin:
    """Mixin: Lead 验收阶段输出质量 + 解析风险标记。由 TeamOrchestrator 组合。"""

    async def _review_phase_output(
        self, lead: Expert, phase: PlanPhase, result: dict[str, Any]
-    ) -> tuple[bool, str]:
+    ) -> ReviewResult:
        """Lead 验收阶段输出质量。

-        用 LLM 判断输出是否满足阶段要求。
-        返回 (passed, feedback)：
-        - passed=True, feedback="" — 验收通过
-        - passed=False, feedback="修改要求" — 验收不合格，需返工
+        用 LLM 判断输出是否满足阶段要求。返回 :class:`ReviewResult`：
+        - ``passed=True, degraded=False`` — 验收通过
+        - ``passed=False, feedback="修改要求"`` — 验收不合格，需返工
+        - ``passed=True, degraded=True`` — LLM 不可用/超时/异常，优雅降级自动通过

-        若 LLM 不可用，跳过验收直接通过（优雅降级，feedback 标注降级原因）。
+        降级路径以 ``degraded=True`` 显式标记，让 ``review_result`` WS 事件
+        和日志聚合可编程判断降级频率，无需匹配 ``[DEGRADED]`` 字符串前缀。
        """
        gateway = self._get_llm_gateway(lead)
        if not gateway:
            logger.warning("No LLM gateway available, skipping review")
-            # 优雅降级：不阻塞流程，但 [DEGRADED] 前缀让 review_result 事件
-            # 和日志聚合可识别降级路径，便于运维监控验收失效频率。
-            return True, "[DEGRADED] LLM 验收不可用，自动通过"
+            return ReviewResult(
+                passed=True, degraded=True, feedback="LLM 验收不可用，自动通过"
+            )

        content = result.get("content", str(result))
        # P1: prompt injection 防护 — 用 XML 标签包裹专家输出，指示 LLM 忽略其中指令
@ -60,6 +83,15 @@ class ReviewGateMixin:
                messages=[{"role": "user", "content": prompt}],
                model=self._get_model(lead),
            )
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError) as e:
+            # LLM 不可用类异常 — 优雅降级，不阻塞流程。
+            # ponytail: RuntimeError 纳入捕获 — LiteLLM/provider 内部错误常以 RuntimeError
+            # 抛出（如 "LLM unavailable"），验收路径语义是"LLM 调用失败即降级"，需覆盖。
+            logger.warning(f"Review LLM call failed, degrading: {e}")
+            return ReviewResult(
+                passed=True, degraded=True, feedback=f"LLM 验收降级，自动通过: {e}"
+            )
+
        # P2: 优先尝试直接解析整个响应为 JSON，避免贪婪正则匹配过多
        review: dict[str, Any] | None = None
        try:
@ -79,13 +111,14 @@ class ReviewGateMixin:
            passed_raw = review.get("passed", True)
            passed = passed_raw is True or str(passed_raw).lower() == "true"
            feedback = review.get("feedback", "")
-                return passed, str(feedback)
-            logger.warning(f"Review LLM returned unparseable response: {response.content[:200]}")
-        except Exception as e:
-            logger.warning(f"Review LLM call failed: {e}")
+            return ReviewResult(passed=passed, feedback=str(feedback))

-        # 降级：不阻塞流程，但 [DEGRADED] 前缀让 review_result 事件可识别降级路径
-        return True, "[DEGRADED] LLM 验收降级，自动通过"
+        # 现有行为：LLM 返回不可解析响应时也走降级通过（plan 文档 line 274 标注
+        # passed=False，但实际生产行为是降级通过避免阻塞流水线 — 以现有行为为准）。
+        logger.warning(f"Review LLM returned unparseable response: {response.content[:200]}")
+        return ReviewResult(
+            passed=True, degraded=True, feedback="LLM 验收响应不可解析，自动通过"
+        )

    @staticmethod
    def _parse_risk_flags(content: str) -> list[str]:
--- a/src/agentkit/experts/orchestrator.py
+++ b/src/agentkit/experts/orchestrator.py
@ -16,6 +16,7 @@ import logging
 import re
 from typing import Any

+from agentkit.core.exceptions import LLMProviderError
 from agentkit.llm.gateway import LLMGateway

 from ._debate_runner import DebateRunnerMixin
@ -169,7 +170,7 @@ class TeamOrchestrator(
        if self._checkpoint is not None:
            try:
                await self._checkpoint.save_plan(plan)
-            except Exception as e:
+            except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError, ValueError, KeyError) as e:
                logger.warning(f"Checkpoint save_plan failed: {e}")

        # 4. Set EXECUTING status, execute phases
@ -266,7 +267,7 @@ class TeamOrchestrator(
                    if should_save_checkpoint and self._checkpoint is not None:
                        try:
                            await self._checkpoint.save(plan.id, ph, plan.status.value)
-                        except Exception as e:
+                        except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError, ValueError, KeyError) as e:
                            logger.warning(f"Checkpoint save failed for phase {ph.id}: {e}")

                # U3: Divergence detection — check completed phases for conflicts
@ -310,7 +311,7 @@ class TeamOrchestrator(
            if self._checkpoint is not None:
                try:
                    await self._checkpoint.clear(plan.id)
-                except Exception as e:
+                except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError, ValueError, KeyError) as e:
                    logger.warning(f"Checkpoint clear failed: {e}")

            return {
@ -326,7 +327,9 @@ class TeamOrchestrator(
            plan.status = PlanStatus.FAILED
            await self._broadcast_event("team_dissolved", {"team_id": self._team.team_id})
            return await self._fallback_to_single_agent(task, plan, phase_results)
-        except Exception as e:
+        except asyncio.CancelledError:
+            raise
+        except (RuntimeError, ValueError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError, LLMProviderError) as e:
            logger.error(f"Pipeline execution failed: {e}")
            plan.status = PlanStatus.FAILED
            await self._broadcast_event("team_dissolved", {"team_id": self._team.team_id})
@ -463,7 +466,7 @@ class TeamOrchestrator(
            if phases:
                return phases
            logger.warning("LLM decomposition returned no valid phases")
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, json.JSONDecodeError, ValueError, TypeError) as e:
            logger.warning(f"LLM task decomposition failed: {e}")

        return [PlanPhase(name="执行", assigned_expert=lead.config.name, task_description=task)]
@ -588,5 +591,5 @@ class TeamOrchestrator(
                await self._team.handoff_transport.send(
                    self._team.team_channel, {"type": event_type, **data}
                )
-            except Exception as e:
+            except (ConnectionError, RuntimeError, OSError, asyncio.TimeoutError) as e:
                logger.warning(f"Failed to broadcast event '{event_type}': {e}")
--- a/src/agentkit/orchestrator/pipeline_engine.py
+++ b/src/agentkit/orchestrator/pipeline_engine.py
@ -20,7 +20,7 @@ from agentkit.orchestrator.pipeline_schema import (
    StageStatus,
 )
 from agentkit.orchestrator.reflection import PipelineReflector, PipelineReplanner
-from agentkit.orchestrator.retry import StepRetryPolicy, execute_with_retry
+from agentkit.orchestrator.retry import execute_with_retry

 logger = logging.getLogger(__name__)

@ -143,7 +143,7 @@ class PipelineEngine:
                    steps=step_names,
                    input_data=context,
                )
-            except Exception as exc:
+            except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
                logger.warning(f"Failed to create execution state: {exc}")

        # Create Saga orchestrator for compensation tracking
@ -183,7 +183,7 @@ class PipelineEngine:
                            output=step_output,
                            error=step_error,
                        )
-                    except Exception as exc:
+                    except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
                        logger.warning(f"Failed to update step state: {exc}")

                # 收集输出变量
@ -219,7 +219,7 @@ class PipelineEngine:
                                    step_name=stage.name,
                                    error=result.error_message,
                                )
-                            except Exception as exc:
+                            except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
                                logger.warning(f"Failed to persist failure state: {exc}")
                        return result

@ -237,7 +237,7 @@ class PipelineEngine:
                    execution_id=execution_id,
                    final_output=final_output,
                )
-            except Exception as exc:
+            except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
                logger.warning(f"Failed to persist completion state: {exc}")

        return result
@ -346,7 +346,11 @@ class PipelineEngine:

            return sr

-        except Exception as e:
+        except asyncio.CancelledError:
+            raise
+
+        except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
+            # dispatcher / agent 执行失败 — 转 StageResult.FAILED 不向上抛
            return StageResult(
                stage_name=stage.name,
                status=StageStatus.FAILED,
@ -475,7 +479,9 @@ class PipelineEngine:
                    stage,
                    started_at,
                )
-            except Exception as e:
+            except asyncio.CancelledError:
+                raise
+            except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
                logger.error(f"Verifier execution failed for stage '{stage.name}': {e}")
                return StageResult(
                    stage_name=stage.name,
@ -619,7 +625,9 @@ class PipelineEngine:
                step_name=stage.name,
            )
            return sr
-        except Exception as e:
+        except asyncio.CancelledError:
+            raise
+        except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
            return StageResult(
                stage_name=stage.name,
                status=StageStatus.FAILED,
@ -679,7 +687,7 @@ class PipelineEngine:
                score=output_data.get("score", 0.0),
            )
            return feedback
-        except Exception as e:
+        except (TypeError, KeyError, ValueError) as e:
            # 解析失败时直接抛出异常，避免死循环
            logger.error(f"Failed to parse verifier output: {e}")
            raise RuntimeError(
--- a/tests/unit/experts/test_team_orchestrator.py
+++ b/tests/unit/experts/test_team_orchestrator.py
@ -790,10 +790,18 @@ class TestResultSynthesis:
            {"name": "A", "assigned_expert": "member1", "task_description": "阶段A", "depends_on": []},
            {"name": "B", "assigned_expert": "member2", "task_description": "阶段B", "depends_on": []},
        ])
-        # Synthesis call raises to force concatenation fallback
-        gateway.chat = AsyncMock(
-            side_effect=[decomp_response, RuntimeError("LLM unavailable")]
-        )
+        # ponytail: 函数式 side_effect — 首次返回 decomposition，后续一律抛 RuntimeError
+        # （列表式 side_effect 耗尽会抛 StopIteration，被 U3 收窄后的 except 漏捕获；
+        #   函数式让"LLM 不可用"语义明确，覆盖验收+综合所有后续调用）
+        call_count = [0]
+
+        async def chat_side_effect(messages, model=None, **kwargs):
+            call_count[0] += 1
+            if call_count[0] == 1:
+                return decomp_response
+            raise RuntimeError("LLM unavailable")
+
+        gateway.chat = AsyncMock(side_effect=chat_side_effect)
        team._experts["lead"].agent._llm_gateway = gateway

        result = await orchestrator.execute("复杂任务")