feat: complex-task-quality-loop (R1-R12) #22
|
|
@ -42,10 +42,23 @@ logger = logging.getLogger(__name__)
|
||||||
# 最大重规划次数
|
# 最大重规划次数
|
||||||
_DEFAULT_MAX_REPLANS = 2
|
_DEFAULT_MAX_REPLANS = 2
|
||||||
|
|
||||||
|
# U8/R8: max replans triggered by spec-review rejection (separate from
|
||||||
|
# _DEFAULT_MAX_REPLANS which covers execution failures). Cap prevents an
|
||||||
|
# infinite reject→replan loop between user and planner.
|
||||||
|
_MAX_SPEC_REVIEW_REPLANS = 2
|
||||||
|
|
||||||
# U4/R11: default phase budgets for PLAN_EXEC. think=7 (exploration),
|
# U4/R11: default phase budgets for PLAN_EXEC. think=7 (exploration),
|
||||||
# verify=2 (two verification attempts), reflect=1 (one re-injection).
|
# verify=2 (two verification attempts), reflect=1 (one re-injection).
|
||||||
_DEFAULT_PHASE_BUDGETS: dict[str, int] = {"think": 7, "verify": 2, "reflect": 1}
|
_DEFAULT_PHASE_BUDGETS: dict[str, int] = {"think": 7, "verify": 2, "reflect": 1}
|
||||||
|
|
||||||
|
# U8/R8: spec review gate handler. Suspends execution after the first Spec
|
||||||
|
# is generated; returns (decision, feedback). decision ∈ {"approved",
|
||||||
|
# "rejected"}. On rejection the engine replans and re-reviews (capped at
|
||||||
|
# _MAX_SPEC_REVIEW_REPLANS). The handler raises asyncio.TimeoutError when
|
||||||
|
# the user does not reply within the boundary timeout (chat.py: 30 min);
|
||||||
|
# the engine then parks the Spec (not failed) so the user can resume.
|
||||||
|
SpecReviewHandler = Callable[[str, str, list[dict[str, Any]]], Awaitable[tuple[str, str]]]
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class _StreamState:
|
class _StreamState:
|
||||||
|
|
@ -56,6 +69,13 @@ class _StreamState:
|
||||||
total_tokens: int = 0
|
total_tokens: int = 0
|
||||||
step_counter: int = 0
|
step_counter: int = 0
|
||||||
replanned: bool = False
|
replanned: bool = False
|
||||||
|
# U8/R8: spec review gate outcome. Set by _run_spec_review_gate_stream
|
||||||
|
# so the caller (a generator that cannot receive a return value) can
|
||||||
|
# read the result after the async-for loop drains. Values: "approved",
|
||||||
|
# "parked", "replan_exhausted". Empty = gate did not run.
|
||||||
|
spec_review_decision: str = ""
|
||||||
|
# The (possibly replanned) plan to continue execution with on "approved".
|
||||||
|
spec_review_plan: "ExecutionPlan | None" = None
|
||||||
|
|
||||||
|
|
||||||
class PlanExecEngine:
|
class PlanExecEngine:
|
||||||
|
|
@ -96,6 +116,11 @@ class PlanExecEngine:
|
||||||
# historical pitfalls by goal/skill similarity and inject into
|
# historical pitfalls by goal/skill similarity and inject into
|
||||||
# system prompt. None = skip injection (no error).
|
# system prompt. None = skip injection (no error).
|
||||||
pitfall_detector: "PitfallDetector | None" = None,
|
pitfall_detector: "PitfallDetector | None" = None,
|
||||||
|
# U8/R8: spec review gate handler. When set, PLAN_EXEC pauses after
|
||||||
|
# the first Spec is generated and calls this handler to wait for the
|
||||||
|
# user's decision. None = skip the gate (backward compat — the engine
|
||||||
|
# proceeds directly to execution after Spec persistence).
|
||||||
|
spec_review_handler: SpecReviewHandler | None = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
|
|
@ -116,6 +141,10 @@ class PlanExecEngine:
|
||||||
pitfall_detector: U7/R12 — PitfallDetector 单例(KTD-5)。
|
pitfall_detector: U7/R12 — PitfallDetector 单例(KTD-5)。
|
||||||
规划阶段按 goal/skill 相似度检索历史 pitfall 并注入 system
|
规划阶段按 goal/skill 相似度检索历史 pitfall 并注入 system
|
||||||
prompt。None 表示跳过注入(不报错)。
|
prompt。None 表示跳过注入(不报错)。
|
||||||
|
spec_review_handler: U8/R8 — async handler called as
|
||||||
|
``(spec_id, goal, steps) -> (decision, feedback)`` after the
|
||||||
|
first Spec is generated. Suspends execution until the user
|
||||||
|
replies. None skips the gate (backward compat).
|
||||||
"""
|
"""
|
||||||
self._llm_gateway = llm_gateway
|
self._llm_gateway = llm_gateway
|
||||||
self._max_replans = max_replans
|
self._max_replans = max_replans
|
||||||
|
|
@ -128,6 +157,8 @@ class PlanExecEngine:
|
||||||
self._verification_commands = verification_commands
|
self._verification_commands = verification_commands
|
||||||
# U7/R12: app-state singleton (KTD-5) — constructor injection.
|
# U7/R12: app-state singleton (KTD-5) — constructor injection.
|
||||||
self._pitfall_detector = pitfall_detector
|
self._pitfall_detector = pitfall_detector
|
||||||
|
# U8/R8: spec review gate handler. None = skip gate (backward compat).
|
||||||
|
self._spec_review_handler = spec_review_handler
|
||||||
# U4/R11: copy the default to avoid mutating the module-level dict.
|
# U4/R11: copy the default to avoid mutating the module-level dict.
|
||||||
self._phase_budgets = (
|
self._phase_budgets = (
|
||||||
dict(phase_budgets) if phase_budgets is not None else dict(_DEFAULT_PHASE_BUDGETS)
|
dict(phase_budgets) if phase_budgets is not None else dict(_DEFAULT_PHASE_BUDGETS)
|
||||||
|
|
@ -328,8 +359,9 @@ class PlanExecEngine:
|
||||||
)
|
)
|
||||||
|
|
||||||
# Persist plan as Spec if spec_manager is provided
|
# Persist plan as Spec if spec_manager is provided
|
||||||
|
current_plan = plan
|
||||||
if self._spec_manager is not None:
|
if self._spec_manager is not None:
|
||||||
spec = self._plan_to_spec(plan)
|
spec = self._plan_to_spec(current_plan)
|
||||||
self._spec_manager.create(spec)
|
self._spec_manager.create(spec)
|
||||||
state.step_counter += 1
|
state.step_counter += 1
|
||||||
yield ReActEvent(
|
yield ReActEvent(
|
||||||
|
|
@ -338,8 +370,77 @@ class PlanExecEngine:
|
||||||
data={"spec_id": spec.spec_id, "goal": spec.goal, "num_steps": len(spec.steps)},
|
data={"spec_id": spec.spec_id, "goal": spec.goal, "num_steps": len(spec.steps)},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# U8/R8: Spec review gate — pause for user review after the
|
||||||
|
# first Spec is generated. approved → continue; rejected →
|
||||||
|
# replan (cap _MAX_SPEC_REVIEW_REPLANS); timeout → park (not
|
||||||
|
# fail). Handler is None → skip (backward compat). The gate
|
||||||
|
# only runs when a spec_manager is present (no Spec = nothing
|
||||||
|
# to review); spec_manager is None + handler set → skip + warn.
|
||||||
|
if self._spec_review_handler is not None:
|
||||||
|
# The gate is an async generator: it yields the
|
||||||
|
# spec_review_request / spec_review_reply / spec_created
|
||||||
|
# events and writes the outcome onto ``state`` (a generator
|
||||||
|
# cannot return a value). We re-yield its events here.
|
||||||
|
async for gate_event in self._run_spec_review_gate_stream(
|
||||||
|
spec=spec,
|
||||||
|
goal=goal,
|
||||||
|
system_prompt=system_prompt,
|
||||||
|
task_type=task_type,
|
||||||
|
available_skills=available_skills,
|
||||||
|
state=state,
|
||||||
|
cancellation_token=cancellation_token,
|
||||||
|
):
|
||||||
|
yield gate_event
|
||||||
|
|
||||||
|
decision = state.spec_review_decision
|
||||||
|
if decision == "parked":
|
||||||
|
trace_outcome = "parked"
|
||||||
|
output = f"Spec {spec.spec_id} parked: review timed out."
|
||||||
|
state.step_counter += 1
|
||||||
|
yield ReActEvent(
|
||||||
|
event_type="final_answer",
|
||||||
|
step=state.step_counter,
|
||||||
|
data={
|
||||||
|
"output": output,
|
||||||
|
"total_steps": len(state.trajectory),
|
||||||
|
"total_tokens": state.total_tokens,
|
||||||
|
"plan_id": current_plan.plan_id,
|
||||||
|
"plan_status": "parked",
|
||||||
|
"replanned": state.replanned,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
return
|
||||||
|
if decision == "replan_exhausted":
|
||||||
|
trace_outcome = "error"
|
||||||
|
output = (
|
||||||
|
"Spec review failed: replan cap "
|
||||||
|
f"({_MAX_SPEC_REVIEW_REPLANS}) exceeded after "
|
||||||
|
"repeated rejections."
|
||||||
|
)
|
||||||
|
state.step_counter += 1
|
||||||
|
yield ReActEvent(
|
||||||
|
event_type="final_answer",
|
||||||
|
step=state.step_counter,
|
||||||
|
data={
|
||||||
|
"output": output,
|
||||||
|
"total_steps": len(state.trajectory),
|
||||||
|
"total_tokens": state.total_tokens,
|
||||||
|
"plan_id": current_plan.plan_id,
|
||||||
|
"plan_status": "failed",
|
||||||
|
"replanned": state.replanned,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
return
|
||||||
|
# approved — new_plan may differ if a replan happened
|
||||||
|
current_plan = state.spec_review_plan or current_plan
|
||||||
|
elif self._spec_review_handler is not None:
|
||||||
|
# spec_manager is None → no Spec to review; skip gate + warn.
|
||||||
|
logger.warning(
|
||||||
|
"spec_review_handler set but spec_manager is None — "
|
||||||
|
"skipping spec review gate (no Spec to review)"
|
||||||
|
)
|
||||||
|
|
||||||
# ── Phase 2 & 3: Execute with optional replanning ──
|
# ── Phase 2 & 3: Execute with optional replanning ──
|
||||||
current_plan = plan
|
|
||||||
replan_count = 0
|
replan_count = 0
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
|
@ -585,6 +686,248 @@ class PlanExecEngine:
|
||||||
return f"{system_prompt}\n\n{section}"
|
return f"{system_prompt}\n\n{section}"
|
||||||
return section
|
return section
|
||||||
|
|
||||||
|
async def _emit_callback_safe(self, event_type: str, data: dict[str, Any]) -> None:
|
||||||
|
"""U8: emit a step event via the non-stream callback, swallowing the
|
||||||
|
same exception families the existing callback call sites swallow.
|
||||||
|
|
||||||
|
ponytail: tiny helper exists only because the spec review gate emits
|
||||||
|
several events on the non-streaming path — inlining the try/except 4x
|
||||||
|
would be noisier than this 4-line method.
|
||||||
|
"""
|
||||||
|
if not self._step_event_callback:
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
await self._step_event_callback(event_type, data)
|
||||||
|
except (
|
||||||
|
RuntimeError,
|
||||||
|
ValueError,
|
||||||
|
TypeError,
|
||||||
|
KeyError,
|
||||||
|
AttributeError,
|
||||||
|
ConnectionError,
|
||||||
|
asyncio.TimeoutError,
|
||||||
|
) as e:
|
||||||
|
logger.warning(f"Step event callback failed: {e}")
|
||||||
|
|
||||||
|
async def _run_spec_review_gate_stream(
    self,
    spec: Spec,
    goal: str,
    system_prompt: str | None,
    task_type: str,
    available_skills: list[str],
    state: "_StreamState",
    cancellation_token: CancellationToken | None,
):
    """U8/R8: streaming spec review gate (async generator).

    Repeatedly asks ``self._spec_review_handler`` to review the current
    Spec and yields ``spec_review_request`` / ``spec_review_reply`` /
    ``spec_created`` events along the way. Because an async generator
    cannot return a value, the outcome is written onto ``state``:

    * ``state.spec_review_decision`` is set to ``"approved"``,
      ``"parked"`` or ``"replan_exhausted"``.
    * ``state.spec_review_plan`` is the regenerated plan on approval after
      at least one rejection, otherwise ``None`` (the caller keeps its
      original plan).

    Any decision other than ``"approved"`` is treated as a rejection and
    triggers a replan, up to ``_MAX_SPEC_REVIEW_REPLANS`` times. When the
    handler raises ``asyncio.TimeoutError`` the Spec is parked via
    ``self._spec_manager.park`` rather than failed.

    Args:
        spec: The Spec presented for the first review round.
        goal: Goal passed to the planner when a rejection forces a replan.
        system_prompt: Forwarded to the planner in the replan context.
        task_type: Forwarded to the planner in the replan context.
        available_skills: Forwarded to the planner on replan.
        state: Shared stream state; its step counter is advanced for every
            yielded event and it receives the gate outcome.
        cancellation_token: Checked at the top of each review round when
            provided.
    """

    def _next_event(event_type: str, data: dict) -> ReActEvent:
        # Every gate event consumes one slot of the shared step counter.
        state.step_counter += 1
        return ReActEvent(event_type=event_type, step=state.step_counter, data=data)

    def _record_outcome(outcome: str, plan: "ExecutionPlan | None") -> None:
        state.spec_review_decision = outcome
        state.spec_review_plan = plan

    replans_done = 0
    pending_spec = spec
    # Stays None until a rejection has produced a regenerated plan.
    regenerated_plan: ExecutionPlan | None = None

    while True:
        if cancellation_token is not None:
            cancellation_token.check()

        review_key = f"{pending_spec.spec_id}:spec_review"
        summary = [
            {"step_id": s.step_id, "name": s.name, "description": s.description}
            for s in pending_spec.steps
        ]
        yield _next_event(
            "spec_review_request",
            {
                "spec_id": pending_spec.spec_id,
                "spec_review_id": review_key,
                "goal": pending_spec.goal,
                "steps": summary,
            },
        )

        try:
            decision, feedback = await self._spec_review_handler(
                pending_spec.spec_id, pending_spec.goal, summary
            )
        except asyncio.TimeoutError:
            # The reviewer never answered: park (not fail) the Spec, report
            # it, and only then record the outcome on the shared state.
            self._spec_manager.park(pending_spec.spec_id)
            yield _next_event(
                "spec_review_reply",
                {
                    "spec_id": pending_spec.spec_id,
                    "spec_review_id": review_key,
                    "decision": "timeout",
                    "status": "parked",
                },
            )
            _record_outcome("parked", None)
            return
        except asyncio.CancelledError:
            # Cancellation mid-review must reach the caller untouched.
            raise

        yield _next_event(
            "spec_review_reply",
            {
                "spec_id": pending_spec.spec_id,
                "spec_review_id": review_key,
                "decision": decision,
                "feedback": feedback,
            },
        )

        if decision == "approved":
            _record_outcome("approved", regenerated_plan)
            return

        # Rejected: stop once the replan budget is spent so reviewer and
        # planner cannot loop forever.
        if replans_done >= _MAX_SPEC_REVIEW_REPLANS:
            _record_outcome("replan_exhausted", None)
            return

        replans_done += 1
        replan_context = {
            "system_prompt": system_prompt,
            "task_type": task_type,
            "rejection_feedback": feedback,
        }
        regenerated_plan = await self._planner.generate_plan(
            goal=goal,
            context=replan_context,
            available_skills=available_skills,
        )
        pending_spec = self._plan_to_spec(regenerated_plan)
        self._spec_manager.create(pending_spec)
        yield _next_event(
            "spec_created",
            {
                "spec_id": pending_spec.spec_id,
                "goal": pending_spec.goal,
                "num_steps": len(pending_spec.steps),
                "replan_count": replans_done,
            },
        )
        # Next iteration reviews the regenerated Spec.
|
||||||
|
|
||||||
|
async def _run_spec_review_gate_nonstream(
    self,
    spec: Spec,
    goal: str,
    system_prompt: str | None,
    task_type: str,
    available_skills: list[str],
    cancellation_token: CancellationToken | None,
) -> tuple[str, ExecutionPlan | None]:
    """U8/R8: non-streaming spec review gate.

    Runs the same review / replan loop as the streaming gate, but reports
    progress through ``self._emit_callback_safe`` and returns the outcome
    instead of writing it onto shared state.

    Any decision other than ``"approved"`` is treated as a rejection and
    triggers a replan, up to ``_MAX_SPEC_REVIEW_REPLANS`` times. When the
    handler raises ``asyncio.TimeoutError`` the Spec is parked via
    ``self._spec_manager.park``.

    Args:
        spec: The Spec presented for the first review round.
        goal: Goal passed to the planner when a rejection forces a replan.
        system_prompt: Forwarded to the planner in the replan context.
        task_type: Forwarded to the planner in the replan context.
        available_skills: Forwarded to the planner on replan.
        cancellation_token: Checked at the top of each review round when
            provided.

    Returns:
        ``(decision, new_plan)`` where ``decision`` is ``"approved"``,
        ``"parked"`` or ``"replan_exhausted"``. ``new_plan`` is the
        regenerated plan on approval after at least one rejection, and
        ``None`` in every other case.
    """
    emit = self._emit_callback_safe
    replans_done = 0
    pending_spec = spec
    # Stays None until a rejection has produced a regenerated plan.
    regenerated_plan: ExecutionPlan | None = None

    while True:
        if cancellation_token is not None:
            cancellation_token.check()

        review_key = f"{pending_spec.spec_id}:spec_review"
        summary = [
            {"step_id": s.step_id, "name": s.name, "description": s.description}
            for s in pending_spec.steps
        ]
        request_payload = {
            "spec_id": pending_spec.spec_id,
            "spec_review_id": review_key,
            "goal": pending_spec.goal,
            "steps": summary,
        }
        await emit("spec_review_request", request_payload)

        try:
            decision, feedback = await self._spec_review_handler(
                pending_spec.spec_id, pending_spec.goal, summary
            )
        except asyncio.TimeoutError:
            # The reviewer never answered: park (not fail) the Spec.
            self._spec_manager.park(pending_spec.spec_id)
            timeout_payload = {
                "spec_id": pending_spec.spec_id,
                "spec_review_id": review_key,
                "decision": "timeout",
                "status": "parked",
            }
            await emit("spec_review_reply", timeout_payload)
            return ("parked", None)
        except asyncio.CancelledError:
            # Cancellation mid-review must reach the caller untouched.
            raise

        reply_payload = {
            "spec_id": pending_spec.spec_id,
            "spec_review_id": review_key,
            "decision": decision,
            "feedback": feedback,
        }
        await emit("spec_review_reply", reply_payload)

        if decision == "approved":
            return ("approved", regenerated_plan)

        # Rejected: give up once the replan budget is spent.
        if replans_done >= _MAX_SPEC_REVIEW_REPLANS:
            return ("replan_exhausted", None)

        replans_done += 1
        replan_context = {
            "system_prompt": system_prompt,
            "task_type": task_type,
            "rejection_feedback": feedback,
        }
        regenerated_plan = await self._planner.generate_plan(
            goal=goal,
            context=replan_context,
            available_skills=available_skills,
        )
        pending_spec = self._plan_to_spec(regenerated_plan)
        self._spec_manager.create(pending_spec)
        created_payload = {
            "spec_id": pending_spec.spec_id,
            "goal": pending_spec.goal,
            "num_steps": len(pending_spec.steps),
            "replan_count": replans_done,
        }
        await emit("spec_created", created_payload)
        # Next iteration reviews the regenerated Spec.
|
||||||
|
|
||||||
async def _execute_loop(
|
async def _execute_loop(
|
||||||
self,
|
self,
|
||||||
messages: list[dict[str, str]],
|
messages: list[dict[str, str]],
|
||||||
|
|
@ -688,29 +1031,57 @@ class PlanExecEngine:
|
||||||
)
|
)
|
||||||
|
|
||||||
# Persist plan as Spec if spec_manager is provided
|
# Persist plan as Spec if spec_manager is provided
|
||||||
|
current_plan = plan
|
||||||
if self._spec_manager is not None:
|
if self._spec_manager is not None:
|
||||||
spec = self._plan_to_spec(plan)
|
spec = self._plan_to_spec(current_plan)
|
||||||
self._spec_manager.create(spec)
|
self._spec_manager.create(spec)
|
||||||
if self._step_event_callback:
|
await self._emit_callback_safe(
|
||||||
try:
|
"spec_created",
|
||||||
await self._step_event_callback(
|
{
|
||||||
"spec_created",
|
"spec_id": spec.spec_id,
|
||||||
{
|
"goal": spec.goal,
|
||||||
"spec_id": spec.spec_id,
|
"num_steps": len(spec.steps),
|
||||||
"goal": spec.goal,
|
},
|
||||||
"num_steps": len(spec.steps),
|
)
|
||||||
},
|
|
||||||
|
# U8/R8: spec review gate (non-streaming). See
|
||||||
|
# _run_spec_review_gate_stream for the streaming twin.
|
||||||
|
if self._spec_review_handler is not None:
|
||||||
|
decision, new_plan = await self._run_spec_review_gate_nonstream(
|
||||||
|
spec=spec,
|
||||||
|
goal=goal,
|
||||||
|
system_prompt=system_prompt,
|
||||||
|
task_type=task_type,
|
||||||
|
available_skills=available_skills,
|
||||||
|
cancellation_token=cancellation_token,
|
||||||
|
)
|
||||||
|
if decision == "parked":
|
||||||
|
return ReActResult(
|
||||||
|
output=f"Spec {spec.spec_id} parked: review timed out.",
|
||||||
|
trajectory=trajectory,
|
||||||
|
total_steps=len(trajectory),
|
||||||
|
total_tokens=total_tokens,
|
||||||
|
status="parked",
|
||||||
)
|
)
|
||||||
except (
|
if decision == "replan_exhausted":
|
||||||
RuntimeError,
|
return ReActResult(
|
||||||
ValueError,
|
output=(
|
||||||
TypeError,
|
"Spec review failed: replan cap "
|
||||||
KeyError,
|
f"({_MAX_SPEC_REVIEW_REPLANS}) exceeded after "
|
||||||
AttributeError,
|
"repeated rejections."
|
||||||
ConnectionError,
|
),
|
||||||
asyncio.TimeoutError,
|
trajectory=trajectory,
|
||||||
) as e:
|
total_steps=len(trajectory),
|
||||||
logger.warning(f"Step event callback failed: {e}")
|
total_tokens=total_tokens,
|
||||||
|
status="error",
|
||||||
|
)
|
||||||
|
# approved — new_plan may differ if a replan happened
|
||||||
|
current_plan = new_plan or current_plan
|
||||||
|
elif self._spec_review_handler is not None:
|
||||||
|
logger.warning(
|
||||||
|
"spec_review_handler set but spec_manager is None — "
|
||||||
|
"skipping spec review gate (no Spec to review)"
|
||||||
|
)
|
||||||
|
|
||||||
if trace_recorder is not None:
|
if trace_recorder is not None:
|
||||||
trace_recorder.record_step(
|
trace_recorder.record_step(
|
||||||
|
|
@ -721,7 +1092,7 @@ class PlanExecEngine:
|
||||||
|
|
||||||
# ── Phase 2 & 3: Execute with replanning ──
|
# ── Phase 2 & 3: Execute with replanning ──
|
||||||
plan_result, trajectory, total_tokens = await self._execute_with_replanning(
|
plan_result, trajectory, total_tokens = await self._execute_with_replanning(
|
||||||
plan=plan,
|
plan=current_plan,
|
||||||
messages=messages,
|
messages=messages,
|
||||||
tools=tools,
|
tools=tools,
|
||||||
model=model,
|
model=model,
|
||||||
|
|
@ -736,7 +1107,7 @@ class PlanExecEngine:
|
||||||
)
|
)
|
||||||
|
|
||||||
# 聚合输出
|
# 聚合输出
|
||||||
output = self._aggregate_output(plan, plan_result)
|
output = self._aggregate_output(current_plan, plan_result)
|
||||||
|
|
||||||
# 确定状态
|
# 确定状态
|
||||||
if plan_result.status == TaskStatus.FAILED:
|
if plan_result.status == TaskStatus.FAILED:
|
||||||
|
|
|
||||||
|
|
@ -35,7 +35,10 @@ class Spec:
|
||||||
spec_id: str
|
spec_id: str
|
||||||
goal: str
|
goal: str
|
||||||
steps: list[SpecStep] = field(default_factory=list)
|
steps: list[SpecStep] = field(default_factory=list)
|
||||||
status: str = "draft" # draft | confirmed | executing | completed | failed
|
# draft | confirmed | executing | completed | failed | parked
|
||||||
|
# U8/R8: "parked" is set when the spec review gate times out (30 min).
|
||||||
|
# A parked spec is NOT failed — the user can resume the review on return.
|
||||||
|
status: str = "draft"
|
||||||
created_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
|
created_at: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
|
||||||
confirmed_at: str | None = None
|
confirmed_at: str | None = None
|
||||||
metadata: dict[str, Any] = field(default_factory=dict)
|
metadata: dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
@ -61,7 +64,9 @@ class SpecManager:
|
||||||
"""Persist a Spec to disk. Returns the file path."""
|
"""Persist a Spec to disk. Returns the file path."""
|
||||||
path = self._specs_dir / f"{spec.spec_id}.yaml"
|
path = self._specs_dir / f"{spec.spec_id}.yaml"
|
||||||
data = asdict(spec)
|
data = asdict(spec)
|
||||||
path.write_text(yaml.dump(data, allow_unicode=True, default_flow_style=False), encoding="utf-8")
|
path.write_text(
|
||||||
|
yaml.dump(data, allow_unicode=True, default_flow_style=False), encoding="utf-8"
|
||||||
|
)
|
||||||
self._cache[spec.spec_id] = spec
|
self._cache[spec.spec_id] = spec
|
||||||
logger.info(f"Spec created: {spec.spec_id} -> {path}")
|
logger.info(f"Spec created: {spec.spec_id} -> {path}")
|
||||||
return path
|
return path
|
||||||
|
|
@ -117,6 +122,42 @@ class SpecManager:
|
||||||
logger.info(f"Spec confirmed: {spec_id}")
|
logger.info(f"Spec confirmed: {spec_id}")
|
||||||
return spec
|
return spec
|
||||||
|
|
||||||
|
def park(self, spec_id: str) -> Spec | None:
    """U8/R8: mark a spec as ``parked`` and re-persist it.

    Used when the review gate times out. A parked spec is distinct from a
    failed one: ``resume`` can move it back to ``draft`` later.

    Args:
        spec_id: Identifier of the spec to park.

    Returns:
        The updated spec, or ``None`` when ``self.get`` finds no spec with
        that id (nothing is written in that case).
    """
    parked = self.get(spec_id)
    if parked is not None:
        parked.status = "parked"
        # ``create`` rewrites the stored spec, so the new status is persisted.
        self.create(parked)
        logger.info(f"Spec parked: {spec_id}")
    return parked
|
||||||
|
|
||||||
|
def resume(self, spec_id: str) -> Spec | None:
    """U8/R8: move a parked spec back to ``draft`` so review can restart.

    Resuming a spec that is not currently parked is a logged no-op rather
    than an error, which keeps a double-resume harmless.

    Args:
        spec_id: Identifier of the spec to resume.

    Returns:
        ``None`` when the spec does not exist; otherwise the spec, with its
        status reset to ``draft`` and re-persisted only if it was parked.
    """
    spec = self.get(spec_id)
    if spec is None:
        return None

    if spec.status == "parked":
        spec.status = "draft"
        # ``create`` rewrites the stored spec, so the new status is persisted.
        self.create(spec)
        logger.info(f"Spec resumed: {spec_id}")
    else:
        logger.warning(f"Spec {spec_id} not parked (status={spec.status}), resume is a no-op")
    return spec
|
||||||
|
|
||||||
def list_specs(self, status: str | None = None) -> list[Spec]:
|
def list_specs(self, status: str | None = None) -> list[Spec]:
|
||||||
"""List all specs, optionally filtered by status. Sorted by created_at desc."""
|
"""List all specs, optionally filtered by status. Sorted by created_at desc."""
|
||||||
specs: list[Spec] = []
|
specs: list[Spec] = []
|
||||||
|
|
|
||||||
|
|
@ -169,6 +169,11 @@ _VALID_TEAM_EVENT_TYPES = frozenset(
|
||||||
"round_summary",
|
"round_summary",
|
||||||
"user_intervention",
|
"user_intervention",
|
||||||
"board_concluded",
|
"board_concluded",
|
||||||
|
# U8/R8: spec review gate events (PLAN_EXEC pauses for user review).
|
||||||
|
# Without this whitelist entry the events silently no-op (per the
|
||||||
|
# streaming-event-contract-residuals learning).
|
||||||
|
"spec_review_request",
|
||||||
|
"spec_review_reply",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -1005,6 +1010,9 @@ async def chat_websocket(websocket: WebSocket, session_id: str) -> None:
|
||||||
# Track pending replies for AskHumanTool and confirmations
|
# Track pending replies for AskHumanTool and confirmations
|
||||||
pending_replies: dict[str, asyncio.Future] = {}
|
pending_replies: dict[str, asyncio.Future] = {}
|
||||||
pending_confirmations: dict[str, asyncio.Future] = {}
|
pending_confirmations: dict[str, asyncio.Future] = {}
|
||||||
|
# U8/R8: pending spec-review futures keyed by spec_review_id. Resolved
|
||||||
|
# by the spec_review_reply client message; cancelled on WS teardown.
|
||||||
|
pending_spec_reviews: dict[str, asyncio.Future] = {}
|
||||||
chat_manager.add(session_id, websocket, pending_replies)
|
chat_manager.add(session_id, websocket, pending_replies)
|
||||||
|
|
||||||
cancellation_token = CancellationToken()
|
cancellation_token = CancellationToken()
|
||||||
|
|
@ -1086,6 +1094,7 @@ async def chat_websocket(websocket: WebSocket, session_id: str) -> None:
|
||||||
message_token,
|
message_token,
|
||||||
pending_replies,
|
pending_replies,
|
||||||
pending_confirmations,
|
pending_confirmations,
|
||||||
|
pending_spec_reviews,
|
||||||
model_override=model,
|
model_override=model,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
@ -1114,6 +1123,29 @@ async def chat_websocket(websocket: WebSocket, session_id: str) -> None:
|
||||||
f"Confirmation {confirmation_id!r} not found in pending_confirmations"
|
f"Confirmation {confirmation_id!r} not found in pending_confirmations"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
elif msg_type == "spec_review_reply":
|
||||||
|
# U8/R8: Reply to a spec review request. The client sends
|
||||||
|
# {spec_review_id, decision: "approved"|"rejected", feedback}.
|
||||||
|
# An unknown spec_review_id is logged + ignored (no crash) —
|
||||||
|
# e.g. a stale reply arriving after the future was popped.
|
||||||
|
spec_review_id = msg.get("spec_review_id")
|
||||||
|
decision = msg.get("decision", "rejected")
|
||||||
|
feedback = msg.get("feedback", "")
|
||||||
|
logger.info(
|
||||||
|
f"Received spec_review_reply: id={spec_review_id!r}, decision={decision!r}"
|
||||||
|
)
|
||||||
|
if spec_review_id and spec_review_id in pending_spec_reviews:
|
||||||
|
fut = pending_spec_reviews[spec_review_id]
|
||||||
|
if not fut.done():
|
||||||
|
fut.set_result((decision, feedback))
|
||||||
|
else:
|
||||||
|
logger.warning(f"spec_review_reply {spec_review_id!r} already resolved")
|
||||||
|
else:
|
||||||
|
logger.warning(
|
||||||
|
f"spec_review_reply {spec_review_id!r} not found in "
|
||||||
|
f"pending_spec_reviews — ignoring"
|
||||||
|
)
|
||||||
|
|
||||||
elif msg_type == "cancel":
|
elif msg_type == "cancel":
|
||||||
cancellation_token.cancel()
|
cancellation_token.cancel()
|
||||||
await websocket.send_json({"type": "result", "data": {"status": "cancelled"}})
|
await websocket.send_json({"type": "result", "data": {"status": "cancelled"}})
|
||||||
|
|
@ -1139,6 +1171,9 @@ async def chat_websocket(websocket: WebSocket, session_id: str) -> None:
|
||||||
for fut in pending_confirmations.values():
|
for fut in pending_confirmations.values():
|
||||||
if not fut.done():
|
if not fut.done():
|
||||||
fut.cancel()
|
fut.cancel()
|
||||||
|
for fut in pending_spec_reviews.values():
|
||||||
|
if not fut.done():
|
||||||
|
fut.cancel()
|
||||||
chat_manager.remove(session_id, websocket)
|
chat_manager.remove(session_id, websocket)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1150,6 +1185,7 @@ async def _handle_chat_message(
|
||||||
cancellation_token: CancellationToken,
|
cancellation_token: CancellationToken,
|
||||||
pending_replies: dict[str, asyncio.Future],
|
pending_replies: dict[str, asyncio.Future],
|
||||||
pending_confirmations: dict[str, asyncio.Future] | None = None,
|
pending_confirmations: dict[str, asyncio.Future] | None = None,
|
||||||
|
pending_spec_reviews: dict[str, asyncio.Future] | None = None,
|
||||||
model_override: str | None = None,
|
model_override: str | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Handle a user message: append to session, execute Agent, stream events.
|
"""Handle a user message: append to session, execute Agent, stream events.
|
||||||
|
|
@ -1404,6 +1440,63 @@ async def _handle_chat_message(
|
||||||
finally:
|
finally:
|
||||||
_pending_confirmations.pop(confirmation_id, None)
|
_pending_confirmations.pop(confirmation_id, None)
|
||||||
|
|
||||||
|
# U8/R8: spec review handler — only wired when the engine is a
|
||||||
|
# PlanExecEngine (the WS path's _build_phase_engine returns a ReActEngine
|
||||||
|
# with phase_policy, so this is a no-op there; REST/tests that use
|
||||||
|
# PlanExecEngine get the gate). Different semantics from _confirmation_
|
||||||
|
# handler: 30-min timeout (long task user availability) vs 5-min, returns
|
||||||
|
# (decision, feedback) tuple not bool, and on timeout RAISES
|
||||||
|
# asyncio.TimeoutError so the engine can park the Spec (not fail it).
|
||||||
|
_pending_spec_reviews = pending_spec_reviews if pending_spec_reviews is not None else {}
|
||||||
|
|
||||||
|
async def _spec_review_handler(spec_id: str, goal: str, steps: list[dict]) -> tuple[str, str]:
    """Send a spec_review_request to the frontend and wait for the user's decision.

    Args:
        spec_id: Identifier of the Spec under review.
        goal: Goal text shown to the user.
        steps: Step dicts forwarded verbatim to the frontend.

    Returns:
        The ``(decision, feedback)`` tuple the pending future is resolved with.

    Raises:
        asyncio.TimeoutError: No reply arrived within 30 minutes (the engine
            parks the Spec on timeout instead of failing it).
        asyncio.CancelledError: The pending future was cancelled while
            waiting (e.g. the stream is cancelled mid-review).
    """
    # spec_review_id MUST match the engine's format (f"{spec_id}:spec_review")
    # — one review per spec (stable identifier, terminal-event symmetry).
    spec_review_id = f"{spec_id}:spec_review"

    # Register the future BEFORE sending the request. send_json awaits, so
    # a reply could otherwise be processed while no future exists under this
    # id; that reply would be dropped and we would wait out the full timeout.
    # NOTE(review): presumably the WS receive loop resolves this future by
    # spec_review_id — confirm against the loop.
    loop = asyncio.get_running_loop()
    future: asyncio.Future[tuple[str, str]] = loop.create_future()
    _pending_spec_reviews[spec_review_id] = future

    try:
        await websocket.send_json(
            {
                "type": "spec_review_request",
                "data": {
                    "spec_id": spec_id,
                    "spec_review_id": spec_review_id,
                    "goal": goal,
                    "steps": steps,
                },
            }
        )
        logger.info(f"Spec review request {spec_review_id} sent, waiting for reply")

        # 30 min (1800s) — long-task user availability per R8. The engine
        # catches TimeoutError and parks the Spec (status="parked", not
        # "failed") so the user can resume on return.
        decision, feedback = await asyncio.wait_for(future, timeout=1800.0)
        logger.info(f"Spec review {spec_review_id} resolved: decision={decision!r}")
        return decision, feedback
    except asyncio.TimeoutError:
        logger.warning(f"Spec review {spec_review_id} timed out (30 min)")
        raise
    finally:
        # Always drop the registration: covers success, timeout, cancellation
        # and a failed send, so no stale future is left behind.
        _pending_spec_reviews.pop(spec_review_id, None)
|
||||||
|
|
||||||
|
# Wire the handler onto a PlanExecEngine only (the WS PLAN_EXEC path uses
|
||||||
|
# a ReActEngine + phase_policy, where this is a no-op). Local import to
|
||||||
|
# avoid a top-level dependency that the WS path doesn't need.
|
||||||
|
from agentkit.core.plan_exec_engine import PlanExecEngine as _PlanExecEngine
|
||||||
|
|
||||||
|
if isinstance(react_engine, _PlanExecEngine):
|
||||||
|
react_engine._spec_review_handler = _spec_review_handler
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Chat session {session_id}: executing with {len(routing.tools)} tools, model={routing.model}, skill={routing.skill_name}"
|
f"Chat session {session_id}: executing with {len(routing.tools)} tools, model={routing.model}, skill={routing.skill_name}"
|
||||||
)
|
)
|
||||||
|
|
@ -1479,6 +1572,22 @@ async def _handle_chat_message(
|
||||||
"data": event.data,
|
"data": event.data,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
elif event.event_type == "spec_review_request":
|
||||||
|
# U8/R8: the _spec_review_handler closure already sent this
|
||||||
|
# request directly to the frontend (it owns the spec_review_id
|
||||||
|
# + future). Swallow the engine's informational event to avoid
|
||||||
|
# a duplicate render (mirrors confirmation_request → pass).
|
||||||
|
pass
|
||||||
|
elif event.event_type == "spec_review_reply":
|
||||||
|
# Forward the engine's reply event so the frontend learns the
|
||||||
|
# outcome — especially the timeout→parked transition, which
|
||||||
|
# the frontend cannot infer (the user never replied).
|
||||||
|
await websocket.send_json(
|
||||||
|
{
|
||||||
|
"type": "spec_review_reply",
|
||||||
|
"data": event.data,
|
||||||
|
}
|
||||||
|
)
|
||||||
elif event.event_type == "phase_violation":
|
elif event.event_type == "phase_violation":
|
||||||
# Wave 4 U2: forward phase violations to the client so the
|
# Wave 4 U2: forward phase violations to the client so the
|
||||||
# frontend can surface them in the PhaseIndicator UI (alongside
|
# frontend can surface them in the PhaseIndicator UI (alongside
|
||||||
|
|
|
||||||
|
|
@ -908,6 +908,12 @@ _PERSISTED_MESSAGE_FIELDS = (
|
||||||
"routing_method",
|
"routing_method",
|
||||||
"thinking",
|
"thinking",
|
||||||
"tool_calls",
|
"tool_calls",
|
||||||
|
# U8/R8: spec review gate fields — a pending spec_review_request must
|
||||||
|
# survive a page reload so the user can still answer it (and a parked
|
||||||
|
# Spec is resumable on return).
|
||||||
|
"spec_review_id",
|
||||||
|
"spec_review_decision",
|
||||||
|
"spec_review_feedback",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,517 @@
|
||||||
|
"""Tests for U8: spec review gate (R8).
|
||||||
|
|
||||||
|
Covers:
|
||||||
|
- Happy path (AE4): PLAN_EXEC pauses for review, user approves, execution resumes
|
||||||
|
- Rejection -> replan -> re-review; replan cap (2) -> failure (not infinite loop)
|
||||||
|
- Timeout -> Spec parked (not failed); ReActResult status="parked"
|
||||||
|
- Stream cancelled mid-review -> CancelledError propagates, no deadlock
|
||||||
|
- spec_review_handler None -> backward compat (no gate)
|
||||||
|
- spec_manager None + handler set -> skip gate + warn
|
||||||
|
- Handler raises -> exception propagated
|
||||||
|
- SpecManager.park()/resume() round-trip; parked survives reload; confirm() works
|
||||||
|
- Whitelist assertion (silent no-op prevention)
|
||||||
|
- Unknown spec_review_id ignored (no crash)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from agentkit.core.exceptions import TaskCancelledError
|
||||||
|
from agentkit.core.plan_exec_engine import PlanExecEngine, _MAX_SPEC_REVIEW_REPLANS
|
||||||
|
from agentkit.core.plan_executor import PlanExecutionResult, StepExecutionResult
|
||||||
|
from agentkit.core.plan_schema import ExecutionPlan, PlanStep, PlanStepStatus
|
||||||
|
from agentkit.core.protocol import CancellationToken, TaskStatus
|
||||||
|
from agentkit.core.react import ReActResult
|
||||||
|
from agentkit.core.spec_manager import Spec, SpecManager, SpecStep
|
||||||
|
|
||||||
|
|
||||||
|
# ── Helpers ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def make_plan(
    goal: str = "test goal",
    plan_id: str = "plan-1",
    steps: list[PlanStep] | None = None,
) -> ExecutionPlan:
    """Build an ExecutionPlan whose plan_id is set explicitly.

    When ``steps`` is None, two default steps ("step-0", "step-1") are used.
    Every step is placed in its own single-element parallel group.
    """
    plan_steps = steps
    if plan_steps is None:
        plan_steps = [
            PlanStep(step_id=f"step-{idx}", name=f"Step {idx}", description=f"{ordinal} step")
            for idx, ordinal in enumerate(("First", "Second"))
        ]

    built = ExecutionPlan(goal=goal, steps=plan_steps)
    built.plan_id = plan_id
    built.parallel_groups = [[step.step_id] for step in plan_steps]
    return built
|
||||||
|
|
||||||
|
|
||||||
|
def make_step_result(
    step_id: str,
    status: PlanStepStatus = PlanStepStatus.COMPLETED,
    result: dict | None = None,
) -> StepExecutionResult:
    """Build a StepExecutionResult with no error.

    A falsy ``result`` (None or an empty dict) is replaced by a default
    payload of the form ``{"content": "result of <step_id>"}``.
    """
    payload = result or {"content": f"result of {step_id}"}
    return StepExecutionResult(step_id=step_id, status=status, result=payload, error=None)
|
||||||
|
|
||||||
|
|
||||||
|
def make_plan_result(
    plan_id: str = "plan-1",
    status: TaskStatus = TaskStatus.COMPLETED,
) -> PlanExecutionResult:
    """Build a PlanExecutionResult holding default results for step-0 and step-1."""
    per_step = {sid: make_step_result(sid) for sid in ("step-0", "step-1")}
    return PlanExecutionResult(
        plan_id=plan_id,
        step_results=per_step,
        status=status,
        total_duration_ms=100.0,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def make_spec(spec_id: str = "plan-1", goal: str = "test goal") -> Spec:
    """Build a Spec containing a single step ("s1")."""
    only_step = SpecStep(step_id="s1", name="Step 1", description="First")
    return Spec(spec_id=spec_id, goal=goal, steps=[only_step])
|
||||||
|
|
||||||
|
|
||||||
|
def make_engine(
    specs_dir: str,
    *,
    spec_review_handler=None,
    spec_manager: SpecManager | None = None,
    step_event_callback=None,
) -> tuple[PlanExecEngine, SpecManager]:
    """Create a PlanExecEngine together with the SpecManager it uses.

    A new SpecManager rooted at ``specs_dir`` is created only when the caller
    does not pass ``spec_manager``. The engine is built with ``llm_gateway=None``.
    """
    if spec_manager is None:
        spec_manager = SpecManager(specs_dir=specs_dir)

    engine = PlanExecEngine(
        llm_gateway=None,
        spec_manager=spec_manager,
        spec_review_handler=spec_review_handler,
        step_event_callback=step_event_callback,
    )
    return engine, spec_manager
|
||||||
|
|
||||||
|
|
||||||
|
def patch_executor(plan_result: PlanExecutionResult):
    """Return a patcher replacing PlanExecutor in plan_exec_engine.

    While the patch is active, constructing PlanExecutor yields a stand-in
    whose async ``execute()`` returns ``plan_result``.
    """
    fake_executor = MagicMock()
    fake_executor.execute = AsyncMock(return_value=plan_result)
    return patch(
        "agentkit.core.plan_exec_engine.PlanExecutor",
        return_value=fake_executor,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ── Whitelist assertion ──────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestWhitelist:
    """Guard against spec-review events being silently dropped by the whitelist."""

    def test_spec_review_events_in_whitelist(self):
        # Imported locally so only this test depends on the chat routes module.
        from agentkit.server.routes.chat import _VALID_TEAM_EVENT_TYPES

        for event_type in ("spec_review_request", "spec_review_reply"):
            assert event_type in _VALID_TEAM_EVENT_TYPES
|
||||||
|
|
||||||
|
|
||||||
|
# ── Happy path (AE4) ─────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestHappyPathStream:
    """Approval path: Spec created -> review requested -> approved -> execution continues."""

    async def test_approve_resumes_execution(self, tmp_path: Path):
        handler_calls: list[tuple[str, str, list]] = []

        async def handler(spec_id: str, goal: str, steps: list[dict]):
            # Record the arguments so the call contract can be checked below.
            handler_calls.append((spec_id, goal, steps))
            return ("approved", "")

        engine, _ = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        planner_patch = patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan))
        with planner_patch, patch_executor(plan_result):
            stream = engine.execute_stream(
                messages=[{"role": "user", "content": "do a complex task"}],
            )
            events = [event async for event in stream]

        event_types = [event.event_type for event in events]
        # The gate emits spec_created, then a review request and a review reply.
        assert "spec_created" in event_types
        assert "spec_review_request" in event_types
        assert "spec_review_reply" in event_types
        # The request must precede the reply.
        assert event_types.index("spec_review_request") < event_types.index("spec_review_reply")
        # After approval the run reaches a final answer that is not parked.
        assert "final_answer" in event_types
        final = next(event for event in events if event.event_type == "final_answer")
        assert final.data["plan_status"] != "parked"

        # The handler is invoked exactly once with (spec_id, goal, step dicts).
        assert len(handler_calls) == 1
        spec_id, goal, steps = handler_calls[0]
        assert spec_id == "plan-1"
        assert goal == "test goal"
        assert isinstance(steps, list)
        assert all("step_id" in step and "name" in step for step in steps)

    async def test_nonstream_approve_returns_success(self, tmp_path: Path):
        async def handler(spec_id, goal, steps):
            return ("approved", "")

        engine, _ = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        planner_patch = patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan))
        with planner_patch, patch_executor(plan_result):
            result = await engine.execute(
                messages=[{"role": "user", "content": "do a complex task"}],
            )

        assert isinstance(result, ReActResult)
        assert result.status == "success"
        # A non-empty aggregated output must be present.
        assert result.output
|
||||||
|
|
||||||
|
|
||||||
|
# ── Edge cases ───────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestRejectionReplan:
    """Rejection path: a rejected Spec triggers a replan and a fresh review."""

    async def test_reject_then_approve_regenerates_spec(self, tmp_path: Path):
        # The handler answers in order: reject with feedback, then approve.
        scripted_answers = [("rejected", "make it simpler"), ("approved", "")]

        async def handler(spec_id, goal, steps):
            return scripted_answers.pop(0)

        engine, _ = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        first_plan = make_plan(plan_id="plan-1")
        second_plan = make_plan(plan_id="plan-2", goal="test goal (simpler)")
        plan_result = make_plan_result()

        planner_patch = patch.object(
            engine._planner,
            "generate_plan",
            AsyncMock(side_effect=[first_plan, second_plan]),
        )
        with planner_patch, patch_executor(plan_result):
            stream = engine.execute_stream(
                messages=[{"role": "user", "content": "do a complex task"}],
            )
            events = [event async for event in stream]

        def of_type(kind):
            return [event for event in events if event.event_type == kind]

        spec_created = of_type("spec_created")
        requests = of_type("spec_review_request")
        replies = of_type("spec_review_reply")
        # One spec, one request and one reply per plan (plan-1, then plan-2).
        assert len(spec_created) == 2
        assert len(requests) == 2
        assert len(replies) == 2
        # The second review is for the replanned spec.
        assert requests[0].data["spec_id"] == "plan-1"
        assert requests[1].data["spec_id"] == "plan-2"
        # Reply one records the rejection and its feedback; reply two the approval.
        assert replies[0].data["decision"] == "rejected"
        assert replies[0].data["feedback"] == "make it simpler"
        assert replies[1].data["decision"] == "approved"
        # The run finishes with a final answer that is neither parked nor failed.
        final = next(event for event in events if event.event_type == "final_answer")
        assert final.data["plan_status"] != "parked"
        assert final.data["plan_status"] != "failed"

    async def test_replan_cap_exhausted_fails(self, tmp_path: Path):
        # The handler rejects every time, so the replan cap must end the loop.
        async def handler(spec_id, goal, steps):
            return ("rejected", "still no good")

        engine, _ = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        # More plans than the cap allows, so the planner side_effect never runs dry.
        plans = [make_plan(plan_id=f"plan-{i}") for i in range(1, 6)]
        plan_result = make_plan_result()

        planner_patch = patch.object(
            engine._planner,
            "generate_plan",
            AsyncMock(side_effect=plans),
        )
        with planner_patch, patch_executor(plan_result):
            stream = engine.execute_stream(
                messages=[{"role": "user", "content": "do a complex task"}],
            )
            events = [event async for event in stream]

        requests = [event for event in events if event.event_type == "spec_review_request"]
        replies = [event for event in events if event.event_type == "spec_review_reply"]
        # Reviews = the initial one plus one per allowed replan; all rejected.
        assert len(requests) == _MAX_SPEC_REVIEW_REPLANS + 1
        assert all(reply.data["decision"] == "rejected" for reply in replies)
        final = next(event for event in events if event.event_type == "final_answer")
        assert final.data["plan_status"] == "failed"
        assert "replan cap" in final.data["output"]
|
||||||
|
|
||||||
|
|
||||||
|
class TestTimeoutParked:
    """Timeout path: a handler timeout parks the Spec instead of failing it."""

    async def test_stream_timeout_parks_spec(self, tmp_path: Path):
        # The handler raises immediately to simulate the review timeout.
        async def handler(spec_id, goal, steps):
            raise asyncio.TimeoutError

        engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        planner_patch = patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan))
        with planner_patch, patch_executor(plan_result):
            stream = engine.execute_stream(
                messages=[{"role": "user", "content": "do a complex task"}],
            )
            events = [event async for event in stream]

        # Exactly one reply, reporting decision=timeout and status=parked.
        replies = [event for event in events if event.event_type == "spec_review_reply"]
        assert len(replies) == 1
        assert replies[0].data["decision"] == "timeout"
        assert replies[0].data["status"] == "parked"
        # The final answer reports parked rather than failed.
        final = next(event for event in events if event.event_type == "final_answer")
        assert final.data["plan_status"] == "parked"
        # The stored Spec carries the parked status.
        stored = mgr.get("plan-1")
        assert stored is not None
        assert stored.status == "parked"

    async def test_nonstream_timeout_returns_parked_status(self, tmp_path: Path):
        async def handler(spec_id, goal, steps):
            raise asyncio.TimeoutError

        engine, mgr = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        planner_patch = patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan))
        with planner_patch, patch_executor(plan_result):
            result = await engine.execute(
                messages=[{"role": "user", "content": "do a complex task"}],
            )

        assert isinstance(result, ReActResult)
        assert result.status == "parked"
        assert mgr.get("plan-1").status == "parked"
|
||||||
|
|
||||||
|
|
||||||
|
class TestCancellation:
    """Cancellation path: cancelling around the review gate raises instead of hanging."""

    async def test_handler_cancelled_propagates(self, tmp_path: Path):
        # A CancelledError raised by the handler must surface from the stream.
        async def handler(spec_id, goal, steps):
            raise asyncio.CancelledError

        engine, _ = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        planner_patch = patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan))
        with planner_patch, patch_executor(plan_result):
            with pytest.raises(asyncio.CancelledError):
                async for _ in engine.execute_stream(
                    messages=[{"role": "user", "content": "do a complex task"}],
                ):
                    pass

    async def test_token_cancelled_before_gate_raises_task_cancelled(self, tmp_path: Path):
        async def handler(spec_id, goal, steps):  # pragma: no cover - never reached
            return ("approved", "")

        engine, _ = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        # The token is already cancelled when the stream starts.
        token = CancellationToken()
        token.cancel()
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        planner_patch = patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan))
        with planner_patch, patch_executor(plan_result):
            with pytest.raises(TaskCancelledError):
                async for _ in engine.execute_stream(
                    messages=[{"role": "user", "content": "do a complex task"}],
                    cancellation_token=token,
                ):
                    pass
|
||||||
|
|
||||||
|
|
||||||
|
class TestBackwardCompat:
    """The gate is skipped when either the handler or the spec manager is missing."""

    async def test_handler_none_skips_gate(self, tmp_path: Path):
        engine, _ = make_engine(str(tmp_path / "specs"), spec_review_handler=None)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        planner_patch = patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan))
        with planner_patch, patch_executor(plan_result):
            stream = engine.execute_stream(
                messages=[{"role": "user", "content": "do a complex task"}],
            )
            events = [event async for event in stream]

        event_types = [event.event_type for event in events]
        # The Spec is still created; only the review events are absent.
        assert "spec_created" in event_types
        assert "spec_review_request" not in event_types
        assert "spec_review_reply" not in event_types
        assert "final_answer" in event_types

    async def test_spec_manager_none_handler_set_skips_gate(self, tmp_path: Path):
        # With a handler but no spec manager, the run must still complete
        # without emitting any spec or review-request events.
        async def handler(spec_id, goal, steps):  # pragma: no cover - never reached
            return ("approved", "")

        engine = PlanExecEngine(llm_gateway=None, spec_manager=None, spec_review_handler=handler)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        planner_patch = patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan))
        with planner_patch, patch_executor(plan_result):
            stream = engine.execute_stream(
                messages=[{"role": "user", "content": "do a complex task"}],
            )
            events = [event async for event in stream]

        event_types = [event.event_type for event in events]
        assert "spec_created" not in event_types  # no spec_manager -> no spec
        assert "spec_review_request" not in event_types
        assert "final_answer" in event_types
|
||||||
|
|
||||||
|
|
||||||
|
# ── Error / failure paths ────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestHandlerRaises:
    """An ordinary exception raised by the handler surfaces to the stream consumer."""

    async def test_handler_value_error_propagates(self, tmp_path: Path):
        async def handler(spec_id, goal, steps):
            raise ValueError("handler blew up")

        engine, _ = make_engine(str(tmp_path / "specs"), spec_review_handler=handler)
        plan = make_plan(plan_id="plan-1")
        plan_result = make_plan_result()

        planner_patch = patch.object(engine._planner, "generate_plan", AsyncMock(return_value=plan))
        with planner_patch, patch_executor(plan_result):
            with pytest.raises(ValueError, match="handler blew up"):
                async for _ in engine.execute_stream(
                    messages=[{"role": "user", "content": "do a complex task"}],
                ):
                    pass
|
||||||
|
|
||||||
|
|
||||||
|
class TestUnknownSpecReviewId:
    """Membership guard for pending review ids: unknown ids are simply absent."""

    def test_unknown_id_ignored(self):
        # NOTE(review): this only checks dict membership on a local dict; it
        # never calls the chat.py WS loop, so it cannot catch a regression
        # there. Consider driving the real handler instead.
        loop = asyncio.new_event_loop()
        pending: dict[str, asyncio.Future] = {}
        try:
            known_future: asyncio.Future = loop.create_future()
            pending["known-id"] = known_future
            # An id that was never registered is not found by the guard.
            unknown = "does-not-exist"
            assert unknown not in pending
            # A registered id is found.
            assert "known-id" in pending
        finally:
            loop.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ── SpecManager integration ──────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestSpecManagerParkResume:
    """SpecManager park()/resume() behaviour, persistence of parked, and confirm()."""

    def test_park_sets_status_parked(self, tmp_path: Path):
        manager = SpecManager(specs_dir=str(tmp_path / "specs"))
        manager.create(make_spec(spec_id="s1"))

        parked = manager.park("s1")

        assert parked is not None
        assert parked.status == "parked"

    def test_resume_sets_status_draft(self, tmp_path: Path):
        manager = SpecManager(specs_dir=str(tmp_path / "specs"))
        manager.create(make_spec(spec_id="s1"))
        manager.park("s1")

        resumed = manager.resume("s1")

        assert resumed is not None
        assert resumed.status == "draft"

    def test_resume_non_parked_is_noop(self, tmp_path: Path):
        # Resuming a spec that was never parked returns it with its status
        # still "draft" rather than raising.
        manager = SpecManager(specs_dir=str(tmp_path / "specs"))
        manager.create(make_spec(spec_id="s1"))

        outcome = manager.resume("s1")

        assert outcome is not None
        assert outcome.status == "draft"

    def test_park_nonexistent_returns_none(self, tmp_path: Path):
        manager = SpecManager(specs_dir=str(tmp_path / "specs"))
        assert manager.park("nope") is None

    def test_resume_nonexistent_returns_none(self, tmp_path: Path):
        manager = SpecManager(specs_dir=str(tmp_path / "specs"))
        assert manager.resume("nope") is None

    def test_parked_survives_reload(self, tmp_path: Path):
        # A second SpecManager on the same directory must see the parked status.
        specs_dir = str(tmp_path / "specs")
        writer = SpecManager(specs_dir=specs_dir)
        writer.create(make_spec(spec_id="s1"))
        writer.park("s1")

        reader = SpecManager(specs_dir=specs_dir)
        loaded = reader.get("s1")

        assert loaded is not None
        assert loaded.status == "parked"

    def test_confirm_still_works(self, tmp_path: Path):
        # confirm() must keep setting the status and the confirmation timestamp.
        manager = SpecManager(specs_dir=str(tmp_path / "specs"))
        manager.create(make_spec(spec_id="s1"))

        confirmed = manager.confirm("s1")

        assert confirmed is not None
        assert confirmed.status == "confirmed"
        assert confirmed.confirmed_at is not None
|
||||||
Loading…
Reference in New Issue