refactor(core,experts): classify except Exception + structured ReviewResult (U3)

Introduce a `ReviewResult` dataclass (passed/degraded/feedback) that replaces the tuple + `[DEGRADED]` prefix in `_review_phase_output`; the 3 `review_result` WS payloads now carry a `degraded` field (AE3). Narrow `except Exception` to specific exception types across 10 files (core/react, rewoo, base, orchestrator, dispatcher, plan_exec_engine + experts/orchestrator, _phase_executor, _review_gate + orchestrator/pipeline_engine), reducing the baseline from 140 to 66 occurrences (>=50% reduction). Fix RuntimeError regression: the review-gate and compression paths now also catch RuntimeError (LLM/provider internal errors) to preserve degradation semantics. Test `side_effect` switched to functional form (a callable instead of a list) to avoid StopIteration when the list is exhausted. ruff clean; 135 key + 469 experts + 163 core tests pass.
2026-06-30 18:03:58 +08:00 · 2026-06-30 18:03:58 +08:00 · be5c4e09f8
parent 47ee2449df
commit be5c4e09f8
11 changed files with 256 additions and 149 deletions
--- a/src/agentkit/core/base.py
+++ b/src/agentkit/core/base.py
@ -246,7 +246,7 @@ class BaseAgent(ABC):
                self._redis = aioredis.from_url(redis_url, decode_responses=True)
                await self._redis.ping()
                logger.info(f"Agent '{self.name}' connected to Redis")
-            except Exception as e:
+            except (ConnectionError, OSError, asyncio.TimeoutError, ValueError) as e:
                self._redis = None
                logger.warning(
                    f"Agent '{self.name}' Redis unavailable: {e}, falling back to local mode"
@ -380,7 +380,10 @@ class BaseAgent(ABC):
            # 失败钩子
            try:
                await self.on_task_failed(task, TaskCancelledError(task.task_id))
            except asyncio.CancelledError:
                raise
            except Exception as hook_err:
                # 用户提供的 hook — 任意异常都可能，不阻塞 TaskResult 构建
                logger.error(f"on_task_failed hook error: {hook_err}")
            elapsed = time.monotonic() - start_time
@ -408,7 +411,10 @@ class BaseAgent(ABC):
                await self.on_task_failed(
                    task, TaskTimeoutError(task.task_id, task.timeout_seconds)
                )
            except asyncio.CancelledError:
                raise
            except Exception as hook_err:
                # 用户提供的 hook — 任意异常都可能，不阻塞 TaskResult 构建
                logger.error(f"on_task_failed hook error: {hook_err}")
            elapsed = time.monotonic() - start_time
@ -427,12 +433,20 @@ class BaseAgent(ABC):
                },
            )
        except asyncio.CancelledError:
            # CancelledError 必须传播，不被 except Exception 吞掉
            raise
        except Exception as e:
            # 框架边界 catch-all：handle_task 是用户实现，可能抛任意异常；
            # execute() 契约要求始终返回 TaskResult，故保留兜底。
            logger.error(f"Agent '{self.name}' task {task.task_id} failed: {e}")
            # 失败钩子
            try:
                await self.on_task_failed(task, e)
            except asyncio.CancelledError:
                raise
            except Exception as hook_err:
                logger.error(f"on_task_failed hook error: {hook_err}")
@ -517,13 +531,13 @@ class BaseAgent(ABC):
                    f"agent:{self.name}:progress",
                    json.dumps(progress_obj.to_dict()),
                )
-            except Exception as e:
+            except (ConnectionError, asyncio.TimeoutError, OSError) as e:
                logger.warning(f"Failed to publish progress for task {task_id}: {e}")
            if self._dispatcher is not None:
                try:
                    await self._dispatcher.handle_progress(progress_obj)
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, RuntimeError) as e:
                    logger.warning(
                        f"Failed to report progress to dispatcher for task {task_id}: {e}"
                    )
@ -544,7 +558,7 @@ class BaseAgent(ABC):
                await asyncio.sleep(30)
        except asyncio.CancelledError:
            pass
-        except Exception as e:
+        except (ConnectionError, asyncio.TimeoutError, OSError, RuntimeError) as e:
            logger.error(f"Heartbeat error for agent '{self.name}': {e}")
    async def _listen_for_tasks(self):
@ -565,11 +579,11 @@ class BaseAgent(ABC):
                        task_data = json.loads(task_json)
                        task = TaskMessage.from_dict(task_data)
                        asyncio.create_task(self._execute_task_with_semaphore(task))
-                    except Exception as e:
+                    except (json.JSONDecodeError, KeyError, TypeError, ValueError) as e:
                        logger.error(f"Failed to parse task message: {e}")
        except asyncio.CancelledError:
            pass
-        except Exception as e:
+        except (ConnectionError, asyncio.TimeoutError, OSError, RuntimeError) as e:
            logger.error(f"Task listener error for agent '{self.name}': {e}")
    async def _execute_task_with_semaphore(self, task: TaskMessage):
@ -593,7 +607,13 @@ class BaseAgent(ABC):
            if self._redis is not None and self._dispatcher is not None:
                await self._dispatcher.handle_result(result)
        except asyncio.CancelledError:
            # CancelledError 必须传播，不被 except 吞掉
            raise
        except Exception as e:
            # 兜底：execute() 内部已捕获大部分异常并返回 TaskResult，
            # 此处仅捕获 dispatcher 失败或 execute() 边界外的异常
            logger.error(f"Agent '{self.name}' task {task.task_id} failed: {e}")
            error_result = TaskResult(
                task_id=task.task_id,
@ -622,5 +642,6 @@ class BaseAgent(ABC):
            jsonschema.validate(data, schema)
        except ImportError:
            logger.warning("jsonschema not installed, skipping input validation")
-        except Exception as e:
+        except (ValueError, TypeError, KeyError) as e:
            # NOTE(review): jsonschema.ValidationError/SchemaError derive from Exception (not ValueError), so this clause may not catch them — confirm before relying on it
            raise SchemaValidationError(self.name, str(e))
--- a/src/agentkit/core/dispatcher.py
+++ b/src/agentkit/core/dispatcher.py
@ -3,6 +3,7 @@
 与业务系统解耦：通过依赖注入获取 Redis 连接和数据库会话。
 """
 import asyncio
 import ipaddress
 import json
 import logging
@ -12,7 +13,6 @@ from typing import Any, Callable, Awaitable
 from urllib.parse import urlparse
 from agentkit.core.exceptions import (
    NoAvailableAgentError,
    TaskDispatchError,
    TaskNotFoundError,
 )
@ -51,7 +51,7 @@ def _validate_callback_url(url: str) -> bool:
    """
    try:
        parsed = urlparse(url)
-    except Exception:
+    except (ValueError, TypeError):
        return False
    if parsed.scheme not in ("http", "https"):
@ -159,7 +159,7 @@ class TaskDispatcher:
            except TaskDispatchError:
                raise
-            except Exception as e:
+            except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
                await db.rollback()
                logger.error(f"Failed to dispatch task {task.task_id}: {e}")
                raise TaskDispatchError(task.task_id, str(e))
@ -197,7 +197,7 @@ class TaskDispatcher:
            except TaskNotFoundError:
                raise
-            except Exception as e:
+            except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
                await db.rollback()
                logger.error(f"Failed to cancel task {task_id}: {e}")
                raise
@ -263,7 +263,7 @@ class TaskDispatcher:
                logger.info(f"Task {result.task_id} result handled (status={result.status})")
-            except Exception as e:
+            except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
                await db.rollback()
                logger.error(f"Failed to handle result for task {result.task_id}: {e}")
@ -295,7 +295,7 @@ class TaskDispatcher:
                )
                await db.commit()
-            except Exception as e:
+            except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
                await db.rollback()
                logger.error(f"Failed to handle progress for task {progress.task_id}: {e}")
@ -359,7 +359,7 @@ class TaskDispatcher:
                if retried > 0:
                    logger.info(f"Retried {retried} failed tasks")
-            except Exception as e:
+            except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError) as e:
                await db.rollback()
                logger.error(f"Failed to retry failed tasks: {e}")
@ -392,7 +392,7 @@ class TaskDispatcher:
            async with httpx.AsyncClient(timeout=10) as client:
                await client.post(callback_url, json=result.to_dict())
                logger.info(f"Callback triggered for task {result.task_id}")
-        except Exception as e:
+        except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError) as e:
            logger.warning(f"Callback failed for task {result.task_id}: {e}")
    def _task_to_dict(self, task: Any) -> dict:
--- a/src/agentkit/core/orchestrator.py
+++ b/src/agentkit/core/orchestrator.py
@ -12,7 +12,8 @@ from dataclasses import dataclass, field
 from enum import Enum
 from typing import TYPE_CHECKING, Any
-from agentkit.core.protocol import TaskMessage, TaskResult, TaskStatus
+from agentkit.core.exceptions import LLMProviderError
 from agentkit.core.protocol import TaskMessage, TaskStatus
 from agentkit.core.shared_workspace import SharedWorkspace
 if TYPE_CHECKING:
@ -224,7 +225,7 @@ class Orchestrator:
                        subtasks=subtasks,
                        parallel_groups=parallel_groups,
                    )
-            except Exception as e:
+            except (RuntimeError, ValueError, KeyError, AttributeError) as e:
                logger.warning(f"GoalPlanner decomposition failed, falling back: {e}")
        # If LLM gateway available, use it for decomposition
@ -239,7 +240,7 @@ class Orchestrator:
                        subtasks=subtasks,
                        parallel_groups=parallel_groups,
                    )
-            except Exception as e:
+            except (LLMProviderError, asyncio.TimeoutError, ConnectionError, ValueError, TypeError, KeyError) as e:
                logger.warning(f"LLM decomposition failed, falling back to simple: {e}")
        # Fallback: single subtask = original task
@ -418,7 +419,7 @@ class Orchestrator:
                            "status": "completed",
                        },
                    ))
-                except Exception as e:
+                except (ConnectionError, RuntimeError, OSError) as e:
                    logger.warning(f"Failed to publish progress via MessageBus: {e}")
            return output
@ -437,10 +438,12 @@ class Orchestrator:
                            "error": "Subtask timed out",
                        },
                    ))
-                except Exception as e:
+                except (ConnectionError, RuntimeError, OSError) as e:
                    logger.warning(f"Failed to publish progress via MessageBus: {e}")
            return error_result
-        except Exception as e:
+        except asyncio.CancelledError:
            raise
        except (RuntimeError, ValueError, KeyError, AttributeError, ConnectionError, LLMProviderError) as e:
            error_result = {"status": "failed", "error": str(e)}
            if self._message_bus is not None:
                try:
@ -455,7 +458,7 @@ class Orchestrator:
                            "error": str(e),
                        },
                    ))
-                except Exception as e:
+                except (ConnectionError, RuntimeError, OSError) as e:
                    logger.warning(f"Failed to publish progress via MessageBus: {e}")
            return error_result
@ -513,7 +516,7 @@ class Orchestrator:
        try:
            agents_info = self._agent_pool.list_agents()
            return [a["name"] for a in agents_info]
-        except Exception:
+        except (RuntimeError, KeyError, AttributeError):
            return []
    def _convert_execution_plan_to_subtasks(
@ -561,7 +564,7 @@ class Orchestrator:
                    description = agent.get("description", "").lower()
                    if skill.lower() in name.lower() or skill.lower() in agent_type.lower() or skill.lower() in description:
                        return name
-        except Exception:
+        except (RuntimeError, KeyError, AttributeError):
            pass
        return None
@ -580,9 +583,6 @@ class Orchestrator:
        Returns:
            OrchestrationResult: 编排结果，metadata 中包含迭代历史
        """
        import time as _time
        start_time = _time.monotonic()
        iteration_history: list[dict[str, Any]] = []
        # First execution
@ -650,7 +650,7 @@ class Orchestrator:
        try:
            return await self._llm_evaluate(task, result)
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, ValueError, RuntimeError) as e:
            logger.warning(f"LLM evaluation failed, falling back to rule-based: {e}")
            return self._rule_based_evaluate(result)
--- a/src/agentkit/core/plan_exec_engine.py
+++ b/src/agentkit/core/plan_exec_engine.py
@ -18,7 +18,7 @@ from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from typing import TYPE_CHECKING, Any, Awaitable, Callable
-from agentkit.core.exceptions import TaskCancelledError, TaskTimeoutError
+from agentkit.core.exceptions import LLMProviderError, TaskCancelledError, TaskTimeoutError
 from agentkit.core.goal_planner import GoalPlanner
 from agentkit.core.plan_executor import PlanExecutor, PlanExecutionResult
 from agentkit.core.plan_schema import ExecutionPlan, PlanStep, PlanStepStatus
@ -214,7 +214,7 @@ class PlanExecEngine:
                        system_prompt += f"\n\n## 参考信息\n{memory_context}"
                    else:
                        system_prompt = f"## 参考信息\n{memory_context}"
-            except Exception as e:
+            except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                logger.warning(f"Memory retrieval failed, continuing without context: {e}")
        # 启动轨迹记录
@ -440,7 +440,7 @@ class PlanExecEngine:
                        value={"output_summary": summary, "agent_name": agent_name},
                        metadata={"task_type": task_type, "outcome": trace_outcome},
                    )
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
                    logger.warning(f"Failed to store task result in episodic memory: {e}")
    # ------------------------------------------------------------------
@ -477,7 +477,7 @@ class PlanExecEngine:
                        system_prompt += f"\n\n## 参考信息\n{memory_context}"
                    else:
                        system_prompt = f"## 参考信息\n{memory_context}"
-            except Exception as e:
+            except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                logger.warning(f"Memory retrieval failed, continuing without context: {e}")
        # 启动轨迹记录
@ -514,7 +514,7 @@ class PlanExecEngine:
                        "goal": plan.goal,
                        "steps": [s.to_dict() for s in plan.steps],
                    })
-                except Exception as e:
+                except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
                    logger.warning(f"Step event callback failed: {e}")
            trajectory.append(ReActStep(
@ -535,7 +535,7 @@ class PlanExecEngine:
                            "goal": spec.goal,
                            "num_steps": len(spec.steps),
                        })
-                    except Exception as e:
+                    except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
                        logger.warning(f"Step event callback failed: {e}")
            if trace_recorder is not None:
@ -604,7 +604,7 @@ class PlanExecEngine:
                        value={"output_summary": summary, "agent_name": agent_name},
                        metadata={"task_type": task_type, "outcome": trace_outcome},
                    )
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
                    logger.warning(f"Failed to store task result in episodic memory: {e}")
    async def _execute_with_replanning(
@ -685,7 +685,7 @@ class PlanExecEngine:
                            "result": step_result.result,
                            "error": step_result.error,
                        })
-                    except Exception as e:
+                    except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
                        logger.warning(f"Step event callback failed: {e}")
                if trace_recorder is not None:
@ -733,7 +733,7 @@ class PlanExecEngine:
                            "root_cause": reflection_report.root_cause,
                            "new_plan_id": current_plan.plan_id,
                        })
-                    except Exception as e:
+                    except (RuntimeError, ValueError, TypeError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError) as e:
                        logger.warning(f"Step event callback failed: {e}")
                trajectory.append(ReActStep(
--- a/src/agentkit/core/react.py
+++ b/src/agentkit/core/react.py
@ -15,7 +15,7 @@ from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from typing import TYPE_CHECKING, Any
-from agentkit.core.exceptions import LoopDetectedError, TaskCancelledError, TaskTimeoutError
+from agentkit.core.exceptions import LLMProviderError, LoopDetectedError, TaskCancelledError, TaskTimeoutError
 from agentkit.core.protocol import CancellationToken
 from agentkit.llm.gateway import LLMGateway
 from agentkit.llm.protocol import LLMResponse
@ -659,7 +659,8 @@ class ReActEngine:
                        )
                        or ""
                    )
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
                    # 检索层故障（RAG/Redis/LLM embedding）— 不阻塞主流程
                    logger.warning(
                        f"Memory retrieval failed, continuing without context: {e}", exc_info=True
                    )
@ -679,7 +680,8 @@ class ReActEngine:
            if compressor:
                try:
                    conversation = await compressor.compress(conversation)
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
                    # 压缩器通常调用 LLM — LLM 不可用类异常降级为原对话
                    logger.warning(
                        f"Context compression failed, continuing with original messages: {e}"
                    )
@ -1052,7 +1054,11 @@ class ReActEngine:
                                        approved = await confirmation_handler(
                                            confirmation_id, command, reason
                                        )
                                    except asyncio.CancelledError:
                                        raise
                                    except Exception as e:
                                        # 用户提供的 confirmation_handler — 任意异常都可能，
                                        # 不阻塞主循环，降级为未批准
                                        logger.warning(f"Confirmation handler error: {e}")
                                if approved:
@ -1066,9 +1072,10 @@ class ReActEngine:
                                        clean_args["_skip_dangerous_check"] = True
                                        try:
                                            tool_result = await tool.safe_execute(**clean_args)
-                                        except Exception as e:
+                                        except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
                                            tool_result = {
-                                                "error": f"Tool '{tc.name}' execution failed: {e}"
+                                                "error": f"Tool '{tc.name}' execution failed: {e}",
                                                "error_code": "tool_execution_failed",
                                            }
                                    else:
                                        clean_args = {
@ -1083,9 +1090,10 @@ class ReActEngine:
                                                if tool
                                                else {"error": f"Tool '{tc.name}' not found"}
                                            )
-                                        except Exception as e:
+                                        except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
                                            tool_result = {
-                                                "error": f"Tool '{tc.name}' execution failed: {e}"
+                                                "error": f"Tool '{tc.name}' execution failed: {e}",
                                                "error_code": "tool_execution_failed",
                                            }
                                yield ReActEvent(
@ -1146,7 +1154,7 @@ class ReActEngine:
                    if self._should_compress(conversation, compressor):
                        try:
                            conversation = await compressor.compress(conversation)
-                        except Exception as e:
+                        except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
                            logger.warning(f"Incremental compression failed: {e}")
                else:
@ -1217,7 +1225,7 @@ class ReActEngine:
                        if self._should_compress(conversation, compressor):
                            try:
                                conversation = await compressor.compress(conversation)
-                            except Exception as e:
+                            except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
                                logger.warning(f"Incremental compression failed: {e}")
                    else:
                        # ponytail: 检查是否为畸形工具调用（含 <tool_use> 但解析失败）
@ -1332,7 +1340,7 @@ class ReActEngine:
                                        reinjections,
                                    )
                                    break
-                            except Exception as e:
+                            except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
                                logger.warning(f"Verification loop failed: {e}")
                        # Yield final_answer event (legacy format for execute_stream consumers)
@ -1428,7 +1436,8 @@ class ReActEngine:
                        value={"output_summary": summary, "agent_name": agent_name},
                        metadata={"task_type": task_type, "outcome": trace_outcome},
                    )
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
                    # EpisodicMemory 持久化故障（PG/Redis）— 不影响主结果
                    logger.warning(f"Failed to store task result in episodic memory: {e}")
    async def execute_stream(
@ -1555,7 +1564,7 @@ class ReActEngine:
        """通过 gateway 查询 model 对应的 provider 名。失败回退 None(字符串拼接)。"""
        try:
            return self._llm_gateway.get_provider_name_for_model(model)
-        except Exception:
+        except (AttributeError, KeyError, LLMProviderError):
            # ponytail: 测试中 gateway 可能是 MagicMock,无该方法;回退保守路径
            return None
@ -1723,7 +1732,7 @@ class ReActEngine:
        if compressor and tool_name:
            try:
                content = await compressor.compress_tool_result(tool_name, result)
-            except Exception as e:
+            except (asyncio.TimeoutError, ConnectionError, LLMProviderError, RuntimeError) as e:
                logger.warning(f"Tool result compression failed for '{tool_name}': {e}")
                content = str(result)
        return {
@ -1771,10 +1780,11 @@ class ReActEngine:
                "error_code": e.error_code,
                "details": e.details,
            }
-        except Exception as e:
+        except (ValueError, TypeError, RuntimeError, asyncio.TimeoutError) as e:
            # 工具执行失败 — 记录结构化错误码,LLM 可在下一步调整策略
            error_msg = f"Tool '{tool_name}' execution failed: {e}"
            logger.warning(error_msg)
-            return {"error": error_msg}
+            return {"error": error_msg, "error_code": "tool_execution_failed"}
    async def _execute_tool_with_confirmation(
        self,
@ -1818,7 +1828,10 @@ class ReActEngine:
            if confirmation_handler is not None:
                try:
                    approved = await confirmation_handler(confirmation_id, command, reason)
                except asyncio.CancelledError:
                    raise
                except Exception as e:
                    # 用户提供的 confirmation_handler — 任意异常都可能，不阻塞主循环
                    logger.warning(f"Confirmation handler error: {e}")
            if approved:
@ -1829,8 +1842,11 @@ class ReActEngine:
                    clean_args["_skip_dangerous_check"] = True
                    try:
                        tool_result = await tool.safe_execute(**clean_args)
-                    except Exception as e:
+                    except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
-                        tool_result = {"error": f"Tool '{tc.name}' execution failed: {e}"}
+                        tool_result = {
                            "error": f"Tool '{tc.name}' execution failed: {e}",
                            "error_code": "tool_execution_failed",
                        }
                else:
                    # Non-dangerous tool: re-execute with skip flag
                    clean_args = {k: v for k, v in tc.arguments.items() if not k.startswith("_")}
@ -1841,7 +1857,7 @@ class ReActEngine:
                            if tool
                            else {"error": f"Tool '{tc.name}' not found"}
                        )
-                    except Exception as e:
+                    except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
                        tool_result = {"error": f"Tool '{tc.name}' execution failed: {e}"}
                events.append(
--- a/src/agentkit/core/rewoo.py
+++ b/src/agentkit/core/rewoo.py
@ -11,23 +11,21 @@ import logging
 import re
 import time
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from typing import TYPE_CHECKING, Any
-from agentkit.core.exceptions import TaskCancelledError, TaskTimeoutError
+from agentkit.core.exceptions import LLMProviderError, TaskCancelledError, TaskTimeoutError
 from agentkit.core.protocol import CancellationToken
 from agentkit.core.react import ReActEngine, ReActEvent, ReActResult, ReActStep
 from agentkit.llm.gateway import LLMGateway
-from agentkit.llm.protocol import LLMResponse
+from agentkit.tools.base import Tool, ToolValidationError
-from agentkit.tools.base import Tool
+from agentkit.telemetry.tracing import start_span, _OTEL_AVAILABLE
 from agentkit.telemetry.tracing import get_tracer, start_span, _OTEL_AVAILABLE
 from agentkit.telemetry.metrics import (
    agent_request_counter,
    agent_duration_histogram,
 )
 if TYPE_CHECKING:
-    from agentkit.core.compressor import CompressionStrategy, ContextCompressor
+    from agentkit.core.compressor import CompressionStrategy
    from agentkit.core.trace import TraceRecorder
    from agentkit.memory.retriever import MemoryRetriever
@ -296,7 +294,7 @@ class ReWOOEngine:
                            effective_system_prompt += f"\n\n## 参考信息\n{memory_context}"
                        else:
                            effective_system_prompt = f"## 参考信息\n{memory_context}"
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                    logger.warning(f"Memory retrieval failed, continuing without context: {e}")
            # ── Phase 1: Planning ──
@ -360,7 +358,7 @@ class ReWOOEngine:
                if compressor:
                    try:
                        llm_messages = await compressor.compress(llm_messages)
-                    except Exception as e:
+                    except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                        logger.warning(f"Context compression failed: {e}")
                response = await self._llm_gateway.chat(
@ -492,7 +490,7 @@ class ReWOOEngine:
                        value={"output_summary": summary, "agent_name": agent_name},
                        metadata={"task_type": task_type, "outcome": trace_outcome},
                    )
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
                    logger.warning(f"Failed to store task result in episodic memory: {e}")
            return ReActResult(
@ -569,7 +567,7 @@ class ReWOOEngine:
                        effective_system_prompt += f"\n\n## 参考信息\n{memory_context}"
                    else:
                        effective_system_prompt = f"## 参考信息\n{memory_context}"
-            except Exception as e:
+            except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                logger.warning(f"Memory retrieval failed, continuing without context: {e}")
        trajectory: list[ReActStep] = []
@ -647,7 +645,7 @@ class ReWOOEngine:
                if compressor:
                    try:
                        llm_messages = await compressor.compress(llm_messages)
-                    except Exception as e:
+                    except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                        logger.warning(f"Context compression failed: {e}")
                response = await self._llm_gateway.chat(
@ -769,6 +767,9 @@ class ReWOOEngine:
                    "total_tokens": total_tokens,
                },
            )
        except asyncio.CancelledError:
            trace_outcome = "cancelled"
            raise
        except Exception as e:
            trace_outcome = "error"
            logger.error(f"ReWOO execute_stream failed: {e}")
@ -786,7 +787,7 @@ class ReWOOEngine:
                        value={"output_summary": summary, "agent_name": agent_name},
                        metadata={"task_type": task_type, "outcome": trace_outcome},
                    )
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, ValueError) as e:
                    logger.warning(f"Failed to store task result in episodic memory: {e}")
    # ── Fallback Strategy Helpers ──────────────────────────
@ -914,7 +915,7 @@ class ReWOOEngine:
                output, synthesis_tokens = await self._synthesis_phase(messages=messages, tool_results=tool_results, model=model, agent_name=agent_name, task_type=task_type, system_prompt=effective_system_prompt, compressor=compressor, cancellation_token=cancellation_token)
                yield ReActEvent(event_type="final_answer", step=len(plan.steps) + 1, data={"output": output, "total_steps": len(plan.steps) + 1, "total_tokens": simplified_tokens + synthesis_tokens})
                return
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
            logger.warning(f"Simplified ReWOO planning also failed in stream mode: {e}")
        # Failed, continue to next strategy by not returning
        # This signals the caller to try the next strategy
@ -951,7 +952,7 @@ class ReWOOEngine:
            ):
                yield event
            return
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ToolValidationError) as e:
            logger.warning(f"ReAct fallback also failed in stream mode: {e}")
        raise _FallbackFailedError("react")
@ -975,13 +976,13 @@ class ReWOOEngine:
            if compressor:
                try:
                    direct_messages = await compressor.compress(direct_messages)
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                    logger.warning(f"Context compression failed in direct fallback: {e}")
            direct_response = await self._llm_gateway.chat(messages=direct_messages, model=model, agent_name=agent_name, task_type=task_type)
            output = direct_response.content or ""
            yield ReActEvent(event_type="final_answer", step=1, data={"output": output, "total_steps": 1, "total_tokens": total_tokens + direct_response.usage.total_tokens})
            return
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
            logger.error(f"Direct LLM fallback also failed in stream mode: {e}")
        raise _FallbackFailedError("direct")
@ -1024,7 +1025,7 @@ class ReWOOEngine:
                output, synthesis_tokens = await self._synthesis_phase(messages=messages, tool_results=tool_results, model=model, agent_name=agent_name, task_type=task_type, system_prompt=effective_system_prompt, compressor=compressor, cancellation_token=cancellation_token)
                yield ReActEvent(event_type="final_answer", step=len(plan.steps) + 1, data={"output": output, "total_steps": len(plan.steps) + 1, "total_tokens": plan_tokens + synthesis_tokens})
                return
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
            logger.warning(f"Plan-exec fallback also failed in stream mode: {e}")
        raise _FallbackFailedError("plan_exec")
@ -1178,7 +1179,7 @@ class ReWOOEngine:
                    total_tokens=total_tokens,
                    fallback_strategy="simplified_rewoo",
                )
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
            logger.warning(f"Simplified ReWOO planning also failed: {e}")
        return None
@ -1219,7 +1220,7 @@ class ReWOOEngine:
            )
            react_result.fallback_strategy = "react"
            return react_result
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ToolValidationError) as e:
            logger.warning(f"ReAct fallback also failed: {e}")
            return None
@ -1247,7 +1248,7 @@ class ReWOOEngine:
            if compressor:
                try:
                    direct_messages = await compressor.compress(direct_messages)
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                    logger.warning(f"Context compression failed in direct fallback: {e}")
            direct_response = await self._llm_gateway.chat(
@ -1284,7 +1285,7 @@ class ReWOOEngine:
                total_tokens=total_tokens,
                fallback_strategy="direct",
            )
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
            logger.error(f"Direct LLM fallback also failed: {e}")
            return None
@ -1361,7 +1362,7 @@ class ReWOOEngine:
                    total_tokens=total_tokens,
                    fallback_strategy="plan_exec",
                )
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError, TypeError, ToolValidationError, json.JSONDecodeError) as e:
            logger.warning(f"Plan-exec fallback also failed: {e}")
        return None
@ -1418,7 +1419,7 @@ class ReWOOEngine:
        if compressor:
            try:
                planning_messages = await compressor.compress(planning_messages)
-            except Exception as e:
+            except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                logger.warning(f"Context compression failed during planning: {e}")
        try:
@ -1429,7 +1430,7 @@ class ReWOOEngine:
                task_type=task_type,
                tools=tool_schemas,
            )
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError) as e:
            logger.warning(f"LLM call failed during planning: {e}")
            return None, 0
@ -1496,7 +1497,7 @@ class ReWOOEngine:
        if compressor:
            try:
                synthesis_messages = await compressor.compress(synthesis_messages)
-            except Exception as e:
+            except (asyncio.TimeoutError, ConnectionError, LLMProviderError) as e:
                logger.warning(f"Context compression failed during synthesis: {e}")
        response = await self._llm_gateway.chat(
@ -1611,7 +1612,7 @@ class ReWOOEngine:
        try:
            result = await tool.safe_execute(**arguments)
            return result
-        except Exception as e:
+        except (ToolValidationError, ValueError, TypeError, RuntimeError) as e:
            error_msg = f"Tool '{tool_name}' execution failed: {e}"
            logger.warning(error_msg)
            return {"error": error_msg}
--- a/src/agentkit/experts/_phase_executor.py
+++ b/src/agentkit/experts/_phase_executor.py
@ -5,6 +5,7 @@
 from __future__ import annotations
 import asyncio
 import copy
 import logging
 from datetime import datetime, timezone
@ -17,8 +18,6 @@ from .expert import Expert
 from .plan import PhaseStatus, PhaseType, PlanPhase, TeamPlan
 if TYPE_CHECKING:
    import asyncio
    from .team import ExpertTeam
 logger = logging.getLogger(__name__)
@ -61,7 +60,7 @@ class PhaseExecutorMixin:
                    full_data = await self._team.workspace.read(ref_key)
                    if full_data:
                        return full_data.get("value", content)
-                except Exception as e:
+                except (asyncio.TimeoutError, ConnectionError, KeyError, AttributeError) as e:
                    logger.warning(f"Failed to read offloaded output '{ref_key}': {e}")
        return content
@ -80,11 +79,11 @@ class PhaseExecutorMixin:
        try:
            # U3: 返工循环 — 最多 MAX_REWORKS + 1 次（1 次初始 + MAX_REWORKS 次返工）
            for _rework_attempt in range(self.MAX_REWORKS + 1):
-                result, last_error, passed, feedback = await self._run_agent_steps(
+                result, last_error, passed, feedback, degraded = await self._run_agent_steps(
                    expert, agent, lead, phase, plan
                )
                done = await self._finalize_phase(
-                    expert, lead, phase, plan, result, passed, feedback
+                    expert, lead, phase, plan, result, passed, feedback, degraded
                )
                if done:
                    return result
@ -181,9 +180,10 @@ class PhaseExecutorMixin:
        lead: Expert,
        phase: PlanPhase,
        plan: TeamPlan,
-    ) -> tuple[dict[str, Any], str | None, bool, str]:
+    ) -> tuple[dict[str, Any], str | None, bool, str, bool]:
        """Run one rework iteration: read deps, build input, execute, review. Returns
-        (result, last_error, passed, feedback). Raises RuntimeError on retry exhaustion."""
+        (result, last_error, passed, feedback, degraded). Raises RuntimeError on retry
        exhaustion."""
        # 每次迭代重新读取依赖输出（前置阶段可能在返工期间完成）
        dependency_outputs: dict[str, Any] = {}
        for dep_id in phase.depends_on:
@ -228,7 +228,12 @@ class PhaseExecutorMixin:
                    raise RuntimeError(f"Agent execution failed: {last_error}")
                result = task_result.output_data or {"content": ""}
                break
-            except Exception as e:
+            except asyncio.CancelledError:
                # CancelledError 必须传播，不被重试逻辑吞掉
                raise
            except (RuntimeError, asyncio.TimeoutError, ConnectionError) as e:
                # agent.execute() 内部已捕获所有异常并返回 TaskResult，
                # 此处仅捕获显式抛出的 RuntimeError + 罕见的基础设施异常
                last_error = str(e)
                if attempt < self.MAX_RETRIES:
                    logger.info(f"Retrying phase {phase.id} (attempt {attempt + 1})")
@ -250,9 +255,9 @@ class PhaseExecutorMixin:
            "risk_description": risk_desc, "phase_id": phase.id, "phase_name": phase.name,
        })
-        # U3: Lead 验收阶段输出
+        # U3: Lead 验收阶段输出 — ReviewResult 结构化结果（含 degraded 标记）
-        passed, feedback = await self._review_phase_output(lead, phase, result)
+        review = await self._review_phase_output(lead, phase, result)
-        return result, last_error, passed, feedback
+        return result, last_error, review.passed, review.feedback, review.degraded
    async def _finalize_phase(
        self,
@ -263,9 +268,15 @@ class PhaseExecutorMixin:
        result: dict[str, Any],
        passed: bool,
        feedback: str,
        degraded: bool = False,
    ) -> bool:
        """Handle review outcome: write workspace + emit completed, or rework/fail. Returns
-        True if done (COMPLETED), False if rework continues. Raises on rework limit."""
+        True if done (COMPLETED), False if rework continues. Raises on rework limit.
        Args:
            degraded: True 表示验收走了降级路径（LLM 不可用/超时/异常时自动通过），
                广播到 ``review_result`` 事件 payload 让前端/运维可编程判断。
        """
        if passed:
            phase.status = PhaseStatus.COMPLETED
            # P2: SharedWorkspace 写入移到验收通过后 — 避免持久化被拒输出
@ -276,6 +287,7 @@ class PhaseExecutorMixin:
            await self._broadcast_event("review_result", {
                "phase_id": phase.id, "phase_name": phase.name, "passed": True,
                "feedback": feedback, "expert": phase.assigned_expert,
                "degraded": degraded,
            })
            if phase.collaboration_contracts:
                await self._notify_collaborators(phase, plan)
@ -288,7 +300,7 @@ class PhaseExecutorMixin:
            })
            return True
-        # 验收不合格 — 返工或标记失败
+        # 验收不合格 — 返工或标记失败（degraded 路径不应走到这里，但保持字段一致）
        phase.rework_count += 1
        phase.review_feedback = feedback
@ -304,6 +316,7 @@ class PhaseExecutorMixin:
                    "expert": phase.assigned_expert,
                    "rework_count": phase.rework_count,
                    "final_status": "failed",
                    "degraded": degraded,
                },
            )
            await self._broadcast_event(
@ -329,6 +342,7 @@ class PhaseExecutorMixin:
                "expert": phase.assigned_expert,
                "rework_count": phase.rework_count,
                "final_status": "rework",
                "degraded": degraded,
            },
        )
        feedback_truncated = feedback[:500] if feedback else ""
@ -377,7 +391,8 @@ class PhaseExecutorMixin:
            agent = await pool.create_agent(temp_config)
            self._temp_agents[phase.id] = temp_config.name
            return agent
-        except Exception as e:
+        except (ValueError, KeyError, RuntimeError, TypeError) as e:
            # pool.create_agent 失败：config 校验/工具注册/依赖缺失等
            logger.warning(
                f"Failed to create isolated agent for phase {phase.id}, "
                f"using expert's existing agent: {e}"
@ -393,5 +408,7 @@ class PhaseExecutorMixin:
        if temp_name:
            try:
                await pool.remove_agent(temp_name)
-            except Exception as e:
+            except asyncio.CancelledError:
                raise
            except (KeyError, RuntimeError) as e:
                logger.warning(f"Failed to clean up isolated agent '{temp_name}': {e}")
--- a/src/agentkit/experts/_review_gate.py
+++ b/src/agentkit/experts/_review_gate.py
@ -5,11 +5,15 @@
 from __future__ import annotations
 import asyncio
 import json
 import logging
 import re
 from dataclasses import dataclass
 from typing import Any
 from agentkit.core.exceptions import LLMProviderError
 from .expert import Expert
 from .plan import PlanPhase
@ -19,27 +23,46 @@ logger = logging.getLogger(__name__)
 _RISK_FLAG_RE = re.compile(r"\[RISK:\s*(.+?)\]", re.DOTALL)
@dataclass
 class ReviewResult:
    """Structured outcome of the lead's review of a phase output (U3).

    Replaces the earlier ``tuple[bool, str]`` return value so that callers and
    the frontend can check the degraded state programmatically instead of
    matching a ``[DEGRADED]`` string prefix.

    Attributes:
        passed: Whether the review passed (True = passed, False = rework needed).
        degraded: Whether the degraded path was taken (automatic pass when the
            LLM is unavailable, times out, or raises).
        feedback: Review feedback: the degradation reason when degraded, empty
            on a normal pass, and the requested changes when rework is needed.
    """
    passed: bool
    degraded: bool = False
    feedback: str = ""
 class ReviewGateMixin:
    """Mixin: Lead 验收阶段输出质量 + 解析风险标记。由 TeamOrchestrator 组合。"""
    async def _review_phase_output(
        self, lead: Expert, phase: PlanPhase, result: dict[str, Any]
-    ) -> tuple[bool, str]:
+    ) -> ReviewResult:
        """Lead 验收阶段输出质量。
-        用 LLM 判断输出是否满足阶段要求。
+        用 LLM 判断输出是否满足阶段要求。返回 :class:`ReviewResult`：
-        返回 (passed, feedback)：
+        - ``passed=True, degraded=False`` — 验收通过
-        - passed=True, feedback="" — 验收通过
+        - ``passed=False, feedback="修改要求"`` — 验收不合格，需返工
-        - passed=False, feedback="修改要求" — 验收不合格，需返工
+        - ``passed=True, degraded=True`` — LLM 不可用/超时/异常，优雅降级自动通过
-        若 LLM 不可用，跳过验收直接通过（优雅降级，feedback 标注降级原因）。
+        降级路径以 ``degraded=True`` 显式标记，让 ``review_result`` WS 事件
        和日志聚合可编程判断降级频率，无需匹配 ``[DEGRADED]`` 字符串前缀。
        """
        gateway = self._get_llm_gateway(lead)
        if not gateway:
            logger.warning("No LLM gateway available, skipping review")
-            # 优雅降级：不阻塞流程，但 [DEGRADED] 前缀让 review_result 事件
+            return ReviewResult(
-            # 和日志聚合可识别降级路径，便于运维监控验收失效频率。
+                passed=True, degraded=True, feedback="LLM 验收不可用，自动通过"
-            return True, "[DEGRADED] LLM 验收不可用，自动通过"
+            )
        content = result.get("content", str(result))
        # P1: prompt injection 防护 — 用 XML 标签包裹专家输出，指示 LLM 忽略其中指令
@ -60,32 +83,42 @@ class ReviewGateMixin:
                messages=[{"role": "user", "content": prompt}],
                model=self._get_model(lead),
            )
-            # P2: 优先尝试直接解析整个响应为 JSON，避免贪婪正则匹配过多
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, RuntimeError) as e:
-            review: dict[str, Any] | None = None
+            # LLM 不可用类异常 — 优雅降级，不阻塞流程。
-            try:
+            # ponytail: RuntimeError 纳入捕获 — LiteLLM/provider 内部错误常以 RuntimeError
-                review = json.loads(response.content)
+            # 抛出（如 "LLM unavailable"），验收路径语义是"LLM 调用失败即降级"，需覆盖。
-            except (json.JSONDecodeError, TypeError):
+            logger.warning(f"Review LLM call failed, degrading: {e}")
-                pass
+            return ReviewResult(
-            if review is None:
+                passed=True, degraded=True, feedback=f"LLM 验收降级，自动通过: {e}"
-                # 回退到正则提取第一个 JSON 对象
+            )
                json_match = re.search(r"\{[^{}]*\}", response.content, re.DOTALL)
                if json_match:
                    try:
                        review = json.loads(json_match.group(0))
                    except json.JSONDecodeError:
                        pass
            if review is not None:
                # ponytail: 显式比较避免 bool("false") == True 陷阱
                passed_raw = review.get("passed", True)
                passed = passed_raw is True or str(passed_raw).lower() == "true"
                feedback = review.get("feedback", "")
                return passed, str(feedback)
            logger.warning(f"Review LLM returned unparseable response: {response.content[:200]}")
        except Exception as e:
            logger.warning(f"Review LLM call failed: {e}")
-        # 降级：不阻塞流程，但 [DEGRADED] 前缀让 review_result 事件可识别降级路径
+        # P2: 优先尝试直接解析整个响应为 JSON，避免贪婪正则匹配过多
-        return True, "[DEGRADED] LLM 验收降级，自动通过"
+        review: dict[str, Any] | None = None
        try:
            review = json.loads(response.content)
        except (json.JSONDecodeError, TypeError):
            pass
        if review is None:
            # 回退到正则提取第一个 JSON 对象
            json_match = re.search(r"\{[^{}]*\}", response.content, re.DOTALL)
            if json_match:
                try:
                    review = json.loads(json_match.group(0))
                except json.JSONDecodeError:
                    pass
        if review is not None:
            # ponytail: 显式比较避免 bool("false") == True 陷阱
            passed_raw = review.get("passed", True)
            passed = passed_raw is True or str(passed_raw).lower() == "true"
            feedback = review.get("feedback", "")
            return ReviewResult(passed=passed, feedback=str(feedback))
        # 现有行为：LLM 返回不可解析响应时也走降级通过（plan 文档 line 274 标注
        # passed=False，但实际生产行为是降级通过避免阻塞流水线 — 以现有行为为准）。
        logger.warning(f"Review LLM returned unparseable response: {response.content[:200]}")
        return ReviewResult(
            passed=True, degraded=True, feedback="LLM 验收响应不可解析，自动通过"
        )
    @staticmethod
    def _parse_risk_flags(content: str) -> list[str]:
--- a/src/agentkit/experts/orchestrator.py
+++ b/src/agentkit/experts/orchestrator.py
@ -16,6 +16,7 @@ import logging
 import re
 from typing import Any
 from agentkit.core.exceptions import LLMProviderError
 from agentkit.llm.gateway import LLMGateway
 from ._debate_runner import DebateRunnerMixin
@ -169,7 +170,7 @@ class TeamOrchestrator(
        if self._checkpoint is not None:
            try:
                await self._checkpoint.save_plan(plan)
-            except Exception as e:
+            except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError, ValueError, KeyError) as e:
                logger.warning(f"Checkpoint save_plan failed: {e}")
        # 4. Set EXECUTING status, execute phases
@ -266,7 +267,7 @@ class TeamOrchestrator(
                    if should_save_checkpoint and self._checkpoint is not None:
                        try:
                            await self._checkpoint.save(plan.id, ph, plan.status.value)
-                        except Exception as e:
+                        except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError, ValueError, KeyError) as e:
                            logger.warning(f"Checkpoint save failed for phase {ph.id}: {e}")
                # U3: Divergence detection — check completed phases for conflicts
@ -310,7 +311,7 @@ class TeamOrchestrator(
            if self._checkpoint is not None:
                try:
                    await self._checkpoint.clear(plan.id)
-                except Exception as e:
+                except (ConnectionError, OSError, asyncio.TimeoutError, RuntimeError, ValueError, KeyError) as e:
                    logger.warning(f"Checkpoint clear failed: {e}")
            return {
@ -326,7 +327,9 @@ class TeamOrchestrator(
            plan.status = PlanStatus.FAILED
            await self._broadcast_event("team_dissolved", {"team_id": self._team.team_id})
            return await self._fallback_to_single_agent(task, plan, phase_results)
-        except Exception as e:
+        except asyncio.CancelledError:
            raise
        except (RuntimeError, ValueError, KeyError, AttributeError, ConnectionError, asyncio.TimeoutError, LLMProviderError) as e:
            logger.error(f"Pipeline execution failed: {e}")
            plan.status = PlanStatus.FAILED
            await self._broadcast_event("team_dissolved", {"team_id": self._team.team_id})
@ -463,7 +466,7 @@ class TeamOrchestrator(
            if phases:
                return phases
            logger.warning("LLM decomposition returned no valid phases")
-        except Exception as e:
+        except (LLMProviderError, asyncio.TimeoutError, ConnectionError, json.JSONDecodeError, ValueError, TypeError) as e:
            logger.warning(f"LLM task decomposition failed: {e}")
        return [PlanPhase(name="执行", assigned_expert=lead.config.name, task_description=task)]
@ -588,5 +591,5 @@ class TeamOrchestrator(
                await self._team.handoff_transport.send(
                    self._team.team_channel, {"type": event_type, **data}
                )
-            except Exception as e:
+            except (ConnectionError, RuntimeError, OSError, asyncio.TimeoutError) as e:
                logger.warning(f"Failed to broadcast event '{event_type}': {e}")
--- a/src/agentkit/orchestrator/pipeline_engine.py
+++ b/src/agentkit/orchestrator/pipeline_engine.py
@ -20,7 +20,7 @@ from agentkit.orchestrator.pipeline_schema import (
    StageStatus,
 )
 from agentkit.orchestrator.reflection import PipelineReflector, PipelineReplanner
-from agentkit.orchestrator.retry import StepRetryPolicy, execute_with_retry
+from agentkit.orchestrator.retry import execute_with_retry
 logger = logging.getLogger(__name__)
@ -143,7 +143,7 @@ class PipelineEngine:
                    steps=step_names,
                    input_data=context,
                )
-            except Exception as exc:
+            except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
                logger.warning(f"Failed to create execution state: {exc}")
        # Create Saga orchestrator for compensation tracking
@ -183,7 +183,7 @@ class PipelineEngine:
                            output=step_output,
                            error=step_error,
                        )
-                    except Exception as exc:
+                    except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
                        logger.warning(f"Failed to update step state: {exc}")
                # 收集输出变量
@ -219,7 +219,7 @@ class PipelineEngine:
                                    step_name=stage.name,
                                    error=result.error_message,
                                )
-                            except Exception as exc:
+                            except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
                                logger.warning(f"Failed to persist failure state: {exc}")
                        return result
@ -237,7 +237,7 @@ class PipelineEngine:
                    execution_id=execution_id,
                    final_output=final_output,
                )
-            except Exception as exc:
+            except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as exc:
                logger.warning(f"Failed to persist completion state: {exc}")
        return result
@ -346,7 +346,11 @@ class PipelineEngine:
            return sr
-        except Exception as e:
+        except asyncio.CancelledError:
            raise
        except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
            # dispatcher / agent 执行失败 — 转 StageResult.FAILED 不向上抛
            return StageResult(
                stage_name=stage.name,
                status=StageStatus.FAILED,
@ -475,7 +479,9 @@ class PipelineEngine:
                    stage,
                    started_at,
                )
-            except Exception as e:
+            except asyncio.CancelledError:
                raise
            except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
                logger.error(f"Verifier execution failed for stage '{stage.name}': {e}")
                return StageResult(
                    stage_name=stage.name,
@ -619,7 +625,9 @@ class PipelineEngine:
                step_name=stage.name,
            )
            return sr
-        except Exception as e:
+        except asyncio.CancelledError:
            raise
        except (asyncio.TimeoutError, ConnectionError, RuntimeError, ValueError) as e:
            return StageResult(
                stage_name=stage.name,
                status=StageStatus.FAILED,
@ -679,7 +687,7 @@ class PipelineEngine:
                score=output_data.get("score", 0.0),
            )
            return feedback
-        except Exception as e:
+        except (TypeError, KeyError, ValueError) as e:
            # 解析失败时直接抛出异常，避免死循环
            logger.error(f"Failed to parse verifier output: {e}")
            raise RuntimeError(
--- a/tests/unit/experts/test_team_orchestrator.py
+++ b/tests/unit/experts/test_team_orchestrator.py
@ -790,10 +790,18 @@ class TestResultSynthesis:
            {"name": "A", "assigned_expert": "member1", "task_description": "阶段A", "depends_on": []},
            {"name": "B", "assigned_expert": "member2", "task_description": "阶段B", "depends_on": []},
        ])
-        # Synthesis call raises to force concatenation fallback
+        # ponytail: 函数式 side_effect — 首次返回 decomposition，后续一律抛 RuntimeError
-        gateway.chat = AsyncMock(
+        # （列表式 side_effect 耗尽会抛 StopIteration，被 U3 收窄后的 except 漏捕获；
-            side_effect=[decomp_response, RuntimeError("LLM unavailable")]
+        #   函数式让"LLM 不可用"语义明确，覆盖验收+综合所有后续调用）
-        )
+        call_count = [0]
        async def chat_side_effect(messages, model=None, **kwargs):
            """Succeed on the first call, then fail on every later call.

            The first invocation returns ``decomp_response``; each subsequent
            invocation raises ``RuntimeError("LLM unavailable")``. Unlike a
            list-style ``side_effect``, this never runs out of values.
            """
            # One-element list from the enclosing scope serves as a mutable call counter.
            call_count[0] += 1
            if call_count[0] == 1:
                return decomp_response
            raise RuntimeError("LLM unavailable")
        gateway.chat = AsyncMock(side_effect=chat_side_effect)
        team._experts["lead"].agent._llm_gateway = gateway
        result = await orchestrator.execute("复杂任务")