import asyncio import hmac import json import re import logging import os import uuid from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path from fastapi import ( APIRouter, Depends, HTTPException, Request, WebSocket, WebSocketDisconnect, Security, ) from fastapi.security import APIKeyHeader, APIKeyQuery from pydantic import BaseModel from agentkit.core.config_driven import ConfigDrivenAgent from agentkit.core.event_queue import EventQueue from agentkit.core.protocol import Event, TaskEventType, TaskStatus, TurnEventType from agentkit.core.react import ReActEngine from agentkit.chat.skill_routing import ExecutionMode, SkillRoutingResult from agentkit.chat.request_preprocessor import RequestPreprocessor from agentkit.server.routes.evolution_dashboard import ( _experiences as _dashboard_experiences, DashboardExperience, _broadcast_event as _broadcast_dashboard_event, ) from agentkit.core.fallback import EMPTY_LLM_RESPONSE from agentkit.chat.sqlite_conversation_store import SqliteConversationStore from agentkit.server.task_store import InMemoryTaskStore from agentkit.session.models import MessageRole # ponytail: importing module-private helpers from chat.py because the frontend # WS connects to /api/v1/portal/ws (this router), not /api/v1/chat/ws/{session_id}. # Without this, @board/@team prefixes are never intercepted and board/team cards # never render. Upgrade path: extract these into a shared experts dispatch module. from agentkit.server.routes.chat import _execute_board_meeting, _execute_team_collab logger = logging.getLogger(__name__) router = APIRouter(tags=["portal"]) # Use a project-local SQLite file to avoid read-only sandbox restrictions on ~/.agentkit. 
_PROJECT_ROOT = Path(__file__).parents[4]
_CONVERSATIONS_DB_PATH = Path(
    os.environ.get("AGENTKIT_CONVERSATIONS_DB", _PROJECT_ROOT / "data" / "conversations.db")
)
# Import-time side effect: make sure the directory holding the DB file exists.
_CONVERSATIONS_DB_PATH.parent.mkdir(parents=True, exist_ok=True)

# Track background ReAct tasks so they are not garbage-collected mid-execution.
# Tasks are removed automatically via add_done_callback when they complete.
_running_background_tasks: set[asyncio.Task] = set()

# ---------------------------------------------------------------------------
# API Key Authentication
# ---------------------------------------------------------------------------

_api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
_api_key_query = APIKeyQuery(name="api_key", auto_error=False)


def _ensure_non_empty(text: str | None) -> str:
    """Return *text* unless it is None, empty, or whitespace-only.

    In those cases the ``EMPTY_LLM_RESPONSE`` placeholder is returned so a
    response is never blank.
    """
    if text and text.strip():
        return text
    return EMPTY_LLM_RESPONSE


async def _emit_event_safe(
    event_queue: EventQueue | None,
    event_type: str,
    task_id: str,
    session_id: str,
    data: dict | None = None,
) -> None:
    """Emit an event to the EventQueue without raising.

    The EQ is a side-channel: emit failures must never break the WebSocket
    flow. Every ``Exception`` raised while building or emitting the event is
    swallowed and logged at warning level. ``asyncio.CancelledError`` is a
    ``BaseException`` and therefore still propagates, so task cancellation
    keeps working.

    Args:
        event_queue: The EventQueue to emit to (no-op if None)
        event_type: Event type (see TaskEventType / TurnEventType)
        task_id: Associated task ID
        session_id: Associated session ID (conversation_id)
        data: Optional event payload; ``None`` is sent as an empty dict
    """
    if event_queue is None:
        return
    try:
        event = Event.create(
            event_type=event_type,
            task_id=task_id,
            session_id=session_id,
            data=data or {},
        )
        await event_queue.emit(event)
    except Exception as e:  # noqa: BLE001 - side-channel boundary, see docstring
        # Previously only QueueFull/RuntimeError/ConnectionError were caught,
        # so e.g. a TypeError from Event.create escaped despite the contract.
        logger.warning(
            "EventQueue emit failed (type=%s): %s", event_type, e, exc_info=True
        )


# P1 #14 fix: TaskStore sync/async compatibility shim.
# InMemoryTaskStore methods are sync; RedisTaskStore methods are async.
# These helpers detect and await coroutines so portal.py works with both.


async def _maybe_await(result):
    """Return *result*, awaiting it first when it is awaitable.

    Shared by the ``_task_store_*`` shims so sync and async TaskStore
    implementations can be called through the same code path.
    """
    import inspect  # local import: keeps this block self-contained

    if inspect.isawaitable(result):
        return await result
    return result


async def _task_store_create(store, *args, **kwargs):
    """Call ``store.create`` and await the result if the store is async."""
    return await _maybe_await(store.create(*args, **kwargs))


async def _task_store_get(store, *args, **kwargs):
    """Call ``store.get`` and await the result if the store is async."""
    return await _maybe_await(store.get(*args, **kwargs))


async def _task_store_update_status(store, *args, **kwargs):
    """Call ``store.update_status`` and await the result if the store is async."""
    return await _maybe_await(store.update_status(*args, **kwargs))


async def _task_store_list_tasks(store, *args, **kwargs):
    """Call ``store.list_tasks`` and await the result if the store is async."""
    return await _maybe_await(store.list_tasks(*args, **kwargs))


async def _verify_api_key(
    request: Request,
    api_key_header: str | None = Security(_api_key_header),
    api_key_query: str | None = Security(_api_key_query),
) -> None:
    """Verify the API key for REST endpoints.

    The configured key is read from ``app.state.server_config.api_key`` and,
    when that yields None, from ``app.state.api_key``. The header value takes
    precedence over the query parameter.

    Raises:
        HTTPException: 401 when a key is configured and the provided key
            does not match (or is missing).
    """
    configured_api_key: str | None = None
    if hasattr(request.app.state, "server_config") and request.app.state.server_config:
        configured_api_key = request.app.state.server_config.api_key
    if configured_api_key is None and hasattr(request.app.state, "api_key"):
        configured_api_key = request.app.state.api_key

    # If no API key is configured, allow all requests (backwards compat)
    if configured_api_key is None:
        return

    provided = api_key_header or api_key_query
    # Constant-time comparison; a missing key is compared as the empty string.
    if not hmac.compare_digest((provided or "").encode(), configured_api_key.encode()):
        raise HTTPException(
            status_code=401,
            detail=(
                "Invalid or missing API key. "
                "Provide via X-API-Key header or api_key query parameter."
            ),
        )


# ---------------------------------------------------------------------------
# In-memory Conversation Store
# ---------------------------------------------------------------------------


@dataclass
class ChatMessage:
    """A single chat message with a UTC creation timestamp."""

    role: str  # "user" or "assistant"
    content: str
    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    metadata: dict = field(default_factory=dict)


@dataclass
class Conversation:
    """A conversation: an id plus its messages and UTC created/updated stamps."""

    id: str
    messages: list[ChatMessage] = field(default_factory=list)
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    updated_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))


# Heartbeat timeout in seconds — 0 disables timeout (for testing)
_WS_HEARTBEAT_TIMEOUT = float(os.environ.get("AGENTKIT_WS_TIMEOUT", "120"))

_conversation_store = SqliteConversationStore(db_path=_CONVERSATIONS_DB_PATH)


class _ConvStoreAsSessionManager:
    """Adapt SqliteConversationStore to the SessionManager.append_message shape
    used by chat.py's _execute_board_meeting / _execute_team_collab.

    ponytail: only append_message is implemented — that's all the board/team
    intercepts call. If future logic needs more SessionManager methods, add
    them here then.
    """

    def __init__(self, store: SqliteConversationStore) -> None:
        self._store = store

    async def append_message(
        self,
        session_id: str,
        role: MessageRole,
        content: str,
        tool_call_id: str | None = None,
        agent_name: str | None = None,
        metadata: dict[str, object] | None = None,
    ) -> None:
        """Persist one message through the wrapped store.

        ``tool_call_id`` and ``agent_name`` are accepted for signature
        compatibility but are not forwarded to the store.
        """
        # Accept both enum members and plain strings for the role.
        role_str = role.value if hasattr(role, "value") else str(role)
        await self._store.add_message(session_id, role_str, content, metadata)


_sm_adapter = _ConvStoreAsSessionManager(_conversation_store)


# ---------------------------------------------------------------------------
# Active portal WebSocket connections by user_id
# ---------------------------------------------------------------------------


class PortalConnectionManager:
    """Track active portal WebSocket connections by authenticated user_id.

    Used by the calendar reminder scheduler (and other user-scoped push
    features) to deliver real-time messages to a user's open chat tab(s).
    """

    # ponytail: per-user connection cap prevents a single client from
    # exhausting memory via unbounded WS spawns. 16 covers typical
    # multi-tab usage. Upgrade path: make configurable via server_config.
    _MAX_CONNECTIONS_PER_USER = 16

    def __init__(self) -> None:
        # user_id -> list of active WebSocket connections
        self._connections: dict[str, list[WebSocket]] = {}
        # Strong references to in-flight eviction closes. The event loop only
        # keeps weak references to tasks, so an unreferenced task may be
        # garbage-collected before it finishes.
        self._closing_tasks: set[asyncio.Task] = set()

    def add(self, user_id: str, ws: WebSocket) -> None:
        """Register *ws* for *user_id*, evicting the oldest socket at the cap."""
        conns = self._connections.setdefault(user_id, [])
        if len(conns) >= self._MAX_CONNECTIONS_PER_USER:
            # Close the oldest connection to make room (FIFO eviction).
            self._close_in_background(conns.pop(0))
        conns.append(ws)

    def _close_in_background(self, ws: WebSocket) -> None:
        """Schedule a best-effort close of an evicted socket."""
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No running loop (sync caller): nothing can be scheduled. Bail
            # out before creating the coroutine so none is left un-awaited.
            logger.debug("No running event loop; evicted portal WS not closed")
            return
        task = loop.create_task(self._close_quietly(ws))
        self._closing_tasks.add(task)
        task.add_done_callback(self._closing_tasks.discard)

    @staticmethod
    async def _close_quietly(ws: WebSocket) -> None:
        """Close *ws*, ignoring failures since the socket may already be dead."""
        try:
            await ws.close(code=1008, reason="Connection limit exceeded")
        except Exception as e:  # noqa: BLE001 - fire-and-forget boundary
            logger.debug("Best-effort close of evicted portal WS failed: %s", e)

    def remove(self, user_id: str, ws: WebSocket) -> None:
        """Drop *ws* from *user_id*'s connections; unknown users are a no-op."""
        conns = self._connections.get(user_id)
        if conns is None:
            return
        # Identity comparison: remove exactly this socket object.
        self._connections[user_id] = [w for w in conns if w is not ws]
        if not self._connections[user_id]:
            del self._connections[user_id]

    async def send_json(self, user_id: str, message: dict[str, object]) -> None:
        """Broadcast a JSON message to all connections for *user_id*.

        Removes stale connections that fail to send. A failure on one socket
        does not prevent delivery to the remaining ones.
        """
        # Snapshot the list: remove() rebinds it while we iterate.
        conns = list(self._connections.get(user_id, []))
        if not conns:
            return
        stale: list[WebSocket] = []
        for ws in conns:
            try:
                await ws.send_json(message)
            except (
                ConnectionError,
                RuntimeError,
                asyncio.TimeoutError,
                # Raised by the WebSocket layer when the peer is already gone;
                # without it one dead tab aborted the whole broadcast.
                WebSocketDisconnect,
            ) as e:
                logger.debug("Portal WS send failed for user %s (marking stale): %s", user_id, e)
                stale.append(ws)
        for ws in stale:
            self.remove(user_id, ws)


portal_connection_manager = PortalConnectionManager()


async def send_to_user(user_id: str, message: dict[str, object]) -> None:
    """Public helper to push a message to all portal WebSockets for a user."""
    await portal_connection_manager.send_json(user_id, message)


# P1 #9 fix: ReAct event type -> TurnEventType mapping for EQ subscribers.
# Preserves the original EQ contract so CLI and other subscribers that
# filter on TurnEventType constants (e.g. 'turn.thinking') keep working.
_REACT_EVENT_TYPE_MAP: dict[str, str] = {
    "thinking": TurnEventType.THINKING,
    "tool_call": TurnEventType.TOOL_CALL,
    "tool_result": TurnEventType.TOOL_RESULT,
    "token": TurnEventType.TOKEN,
    "final_answer": TurnEventType.FINAL_ANSWER,
    "error": TurnEventType.TURN_COMPLETED,  # best-effort mapping
    "confirmation_request": TurnEventType.STEP,
}

# ---------------------------------------------------------------------------
# History injection helper — configurable limit + optional compression
# ---------------------------------------------------------------------------

# Maximum history messages to inject (can be overridden by server config)
_MAX_HISTORY_MESSAGES = 50


async def _build_history_messages(
    conv_id: str,
    limit: int = _MAX_HISTORY_MESSAGES,
) -> list[dict]:
    """Build conversation history messages for LLM context injection.

    Returns a list of {"role": "user"|"assistant", "content": ...} dicts
    representing the conversation history (excluding the current user message,
    which should be appended separately by the caller). Messages with any
    other role are dropped.

    A storage failure is logged and yields an empty list, so the caller
    proceeds without history instead of failing the request.
    """
    try:
        history = await _conversation_store.get_history(conv_id, limit=limit)
    except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError):
        # Previously swallowed silently, which made lost context undiagnosable.
        logger.warning(
            "History lookup failed for %s; continuing without history",
            conv_id,
            exc_info=True,
        )
        return []

    # The last message in history is the current user message (just added),
    # so skip it to avoid duplication.
    return [
        {"role": hist_msg.role, "content": hist_msg.content}
        for hist_msg in history[:-1]
        if hist_msg.role in ("user", "assistant")
    ]


# ---------------------------------------------------------------------------
# Capability mapping
# ---------------------------------------------------------------------------

# Static catalogue of portal capability categories (display strings + icon),
# keyed by category name.
CAPABILITY_CATEGORIES: dict[str, dict[str, str]] = {
    "chat": {
        "display_name": "智能对话",
        "description": "自然语言交互,自动路由到对应能力",
        "icon": "MessageOutlined",
    },
    "workflow": {
        "display_name": "工作流编排",
        "description": "可视化拖拽编排工作流",
        "icon": "ApartmentOutlined",
    },
    "knowledge": {
        "display_name": "知识库",
        "description": "文档摄取、语义检索、多源RAG",
        "icon": "BookOutlined",
    },
    "skills": {
        "display_name": "技能管理",
        "description": "浏览和管理已注册的技能",
        "icon": "AppstoreOutlined",
    },
    "terminal": {
        "display_name": "智能终端",
        "description": "交互式终端会话和命令执行",
        "icon": "CodeOutlined",
    },
    "computer_use": {
        "display_name": "Computer Use",
        "description": "UI自动化操作和截屏识别",
        "icon": "DesktopOutlined",
    },
    "evolution": {
        "display_name": "自进化",
        "description": "经验积累、避坑预警、路径优化",
        "icon": "RiseOutlined",
    },
    "settings": {
        "display_name": "系统设置",
        "description": "配置LLM、技能、知识库连接",
        "icon": "SettingOutlined",
    },
}


# ---------------------------------------------------------------------------
# Request / Response models
# ---------------------------------------------------------------------------


class ChatRequest(BaseModel):
    """Body of the portal chat endpoints."""

    message: str
    conversation_id: str | None = None
    sources: list[str] | None = None
    skill_name: str | None = None


class ChatResponse(BaseModel):
    """Response of the non-streaming portal chat endpoint."""

    conversation_id: str
    message: str
    timestamp: str
    matched_skill: str | None = None
    routing_method: str | None = None
    confidence: float | None = None
    task_id: str | None = None
    status: str = "completed"


class CapabilityInfo(BaseModel):
    """One capability category together with its skill count."""

    name: str
    display_name: str
    description: str
    icon: str
    enabled: bool
    skill_count: int


class CapabilitiesResponse(BaseModel):
    """Envelope for the capabilities listing."""

    capabilities: list[CapabilityInfo]


#
# ---------------------------------------------------------------------------
# Helper: resolve agent + skill for a chat request
# ---------------------------------------------------------------------------


async def _resolve_for_chat(
    request: ChatRequest, req: Request
) -> tuple[
    ConfigDrivenAgent | None, SkillRoutingResult | None, str | None, str | None, float | None
]:
    """Resolve agent and routing for a chat request via RequestPreprocessor.

    Returns (agent, routing_result, matched_skill_name, routing_method, confidence).
    ``agent`` may be None when neither a matched skill nor a default agent
    nor any registered skill is available.
    """
    pool = req.app.state.agent_pool
    skill_registry = req.app.state.skill_registry
    request_preprocessor: RequestPreprocessor = req.app.state.request_preprocessor

    # Get default tools and system prompt: from the "default" agent when it
    # exists, otherwise from the first skill that already has a pooled agent.
    default_tools = []
    default_system_prompt = None
    default_agent = pool.get_agent("default")
    if default_agent is not None:
        default_tools = default_agent.get_tools()
        default_system_prompt = (
            getattr(default_agent, "_system_prompt", None) or default_agent.get_system_prompt()
        )
    else:
        for skill in skill_registry.list_skills():
            candidate = pool.get_agent(skill.name)
            if candidate is not None:
                default_tools = candidate.get_tools()
                default_system_prompt = (
                    getattr(candidate, "_system_prompt", None) or candidate.get_system_prompt()
                )
                break

    # If skill_name is explicitly provided in the request, force it through
    # the preprocessor's "@skill:" prefix; otherwise preprocess the raw
    # message (minimal: @skill prefix + greeting regex + REACT).
    content = (
        f"@skill:{request.skill_name} {request.message}"
        if request.skill_name
        else request.message
    )
    routing_result = await request_preprocessor.preprocess(
        content=content,
        skill_registry=skill_registry,
        default_tools=default_tools,
        default_system_prompt=default_system_prompt,
        default_model="default",
        default_agent_name="default",
    )

    matched_skill_name = routing_result.skill_name or routing_result.agent_name
    routing_method = routing_result.match_method
    confidence = routing_result.match_confidence

    # Get or create agent based on routing result
    if routing_result.matched and routing_result.skill_name:
        agent = pool.get_agent(routing_result.skill_name)
        if agent is None:
            agent = await pool.create_agent_from_skill(routing_result.skill_name)
    else:
        agent = pool.get_agent("default")
        if agent is None:
            # Fallback: try to create from first available skill
            all_skills = skill_registry.list_skills()
            if all_skills:
                agent = await pool.create_agent_from_skill(all_skills[0].name)

    return agent, routing_result, matched_skill_name, routing_method, confidence


async def _messages_with_history(
    conv_id: str, user_message: str, system_prompt: str | None = None
) -> list[dict]:
    """Assemble ``[system?] + history + [current user message]`` for the LLM."""
    messages: list[dict] = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.extend(await _build_history_messages(conv_id))
    messages.append({"role": "user", "content": user_message})
    return messages


def _is_direct_chat(routing_result: SkillRoutingResult | None) -> bool:
    """Return True when routing selected the DIRECT_CHAT execution mode."""
    return (
        routing_result is not None
        and routing_result.execution_mode == ExecutionMode.DIRECT_CHAT
    )


def _warn_if_falling_back_to_react(
    routing_result: SkillRoutingResult | None, transport: str
) -> None:
    """Log a warning when an advanced execution mode is downgraded to REACT."""
    if routing_result is not None and routing_result.execution_mode not in (
        ExecutionMode.REACT,
        ExecutionMode.SKILL_REACT,
    ):
        logger.warning(
            "Execution mode %s not yet supported in portal %s, falling back to REACT",
            routing_result.execution_mode.value,
            transport,
        )


def _acquire_react_engine(agent, llm_gateway, react_config) -> ReActEngine:
    """Return the agent's own ReAct engine (reset) or build a fresh one."""
    engine = getattr(agent, "_react_engine", None)
    if engine is None:
        return ReActEngine(
            llm_gateway=llm_gateway,
            max_steps=react_config["max_steps"],
        )
    engine.reset()
    return engine


# ---------------------------------------------------------------------------
# Endpoints
# ---------------------------------------------------------------------------


@router.post("/portal/chat", response_model=ChatResponse)
async def chat(request: ChatRequest, req: Request, _auth: None = Depends(_verify_api_key)):
    """Send a chat message and get a response with RequestPreprocessor routing.

    Raises:
        HTTPException: 404 when an explicitly requested skill does not exist;
            503 when the ReAct path is required but no agent could be resolved.
    """
    # If skill_name is explicitly requested but not found, return 404
    if request.skill_name:
        skill_registry = req.app.state.skill_registry
        if not skill_registry.has_skill(request.skill_name):
            raise HTTPException(status_code=404, detail=f"Skill '{request.skill_name}' not found")

    agent, routing_result, matched_skill, routing_method, confidence = await _resolve_for_chat(
        request, req
    )

    direct_chat = _is_direct_chat(routing_result)
    if not direct_chat and agent is None:
        # Fail before persisting anything; previously this surfaced as an
        # AttributeError (HTTP 500) after the user message was already stored.
        raise HTTPException(status_code=503, detail="No agent available to handle this request")

    # Create or reuse conversation
    conv = await _conversation_store.get_or_create(request.conversation_id)
    await _conversation_store.add_message(conv.id, "user", request.message)

    llm_gateway = req.app.state.llm_gateway
    task_id = str(uuid.uuid4())
    response_text = ""

    if direct_chat:
        # DIRECT_CHAT: direct LLM call, no ReAct loop (same as WebSocket path)
        chat_messages = await _messages_with_history(
            conv.id, request.message, routing_result.system_prompt
        )
        response = await llm_gateway.chat(
            messages=chat_messages,
            model=routing_result.model or "default",
            agent_name="default",
            task_type="chat",
        )
        response_text = _ensure_non_empty(response.content)
    else:
        # REACT / SKILL_REACT / REWOO / REFLEXION / PLAN_EXEC / TEAM_COLLAB
        # Advanced modes (REWOO, REFLEXION, PLAN_EXEC, TEAM_COLLAB) currently
        # fall back to REACT with a warning. Full integration is tracked separately.
        _warn_if_falling_back_to_react(routing_result, "REST")

        react_config = agent.get_react_config()
        react_engine = _acquire_react_engine(agent, llm_gateway, react_config)
        messages = await _messages_with_history(conv.id, request.message)

        tools = agent.get_tools()
        model = agent.get_model()
        system_prompt = getattr(agent, "_system_prompt", None) or agent.get_system_prompt()
        timeout_seconds = react_config["timeout_seconds"]

        collected_output: list[str] = []
        try:
            async for event in react_engine.execute_stream(
                messages=messages,
                tools=tools,
                model=model,
                agent_name=agent.name,
                system_prompt=system_prompt,
                timeout_seconds=timeout_seconds,
            ):
                if event.event_type == "final_answer":
                    collected_output.append(event.data.get("output", ""))
        except asyncio.CancelledError:
            raise
        except Exception as e:
            # The error is reported to the client as the assistant message;
            # log it too so it is not lost server-side.
            logger.exception("Portal REST ReAct execution failed")
            response_text = f"执行出错: {e}"
        else:
            response_text = _ensure_non_empty(
                "".join(collected_output) if collected_output else None
            )

    await _conversation_store.add_message(conv.id, "assistant", response_text)

    return ChatResponse(
        conversation_id=conv.id,
        message=response_text,
        timestamp=datetime.now(timezone.utc).isoformat(),
        matched_skill=matched_skill,
        routing_method=routing_method,
        confidence=confidence,
        task_id=task_id,
        status="completed",
    )


@router.post("/portal/chat/stream")
async def chat_stream(request: ChatRequest, req: Request, _auth: None = Depends(_verify_api_key)):
    """Stream chat responses via SSE with RequestPreprocessor routing.

    Emits a ``routing`` event first, then either a single ``final_answer``
    (DIRECT_CHAT) or every ReAct engine event. On a ReAct failure an
    ``error`` event is emitted and no assistant message is persisted.

    Raises:
        HTTPException: 503 when the ReAct path is required but no agent
            could be resolved.
    """
    from sse_starlette.sse import EventSourceResponse

    agent, routing_result, matched_skill, routing_method, confidence = await _resolve_for_chat(
        request, req
    )

    direct_chat = _is_direct_chat(routing_result)
    if not direct_chat and agent is None:
        # Reject with a proper status instead of crashing mid-stream.
        raise HTTPException(status_code=503, detail="No agent available to handle this request")

    # Create or reuse conversation
    conv = await _conversation_store.get_or_create(request.conversation_id)
    await _conversation_store.add_message(conv.id, "user", request.message)

    llm_gateway = req.app.state.llm_gateway

    async def event_generator():
        # Send routing info as first event
        yield {
            "event": "routing",
            "data": json.dumps(
                {
                    "skill": matched_skill,
                    "method": routing_method,
                    "confidence": confidence,
                }
            ),
        }

        if direct_chat:
            # DIRECT_CHAT: direct LLM call, no ReAct loop
            chat_messages = await _messages_with_history(
                conv.id, request.message, routing_result.system_prompt
            )
            response = await llm_gateway.chat(
                messages=chat_messages,
                model=routing_result.model or "default",
                agent_name="default",
                task_type="chat",
            )
            response_text = _ensure_non_empty(response.content)
            await _conversation_store.add_message(conv.id, "assistant", response_text)
            yield {
                "event": "final_answer",
                "data": json.dumps(
                    {
                        "step": 0,
                        "data": {"output": response_text},
                        "timestamp": datetime.now(timezone.utc).isoformat(),
                    }
                ),
            }
            return

        # REACT / SKILL_REACT / REWOO / REFLEXION / PLAN_EXEC / TEAM_COLLAB
        # Advanced modes fall back to REACT with a warning.
        _warn_if_falling_back_to_react(routing_result, "SSE")

        react_config = agent.get_react_config()
        react_engine = _acquire_react_engine(agent, llm_gateway, react_config)

        # Inject conversation history, as the REST endpoint does. This path
        # previously sent only the current message, so multi-turn context was
        # lost for streamed ReAct requests.
        messages = await _messages_with_history(conv.id, request.message)

        tools = agent.get_tools()
        model = agent.get_model()
        system_prompt = getattr(agent, "_system_prompt", None) or agent.get_system_prompt()
        timeout_seconds = react_config["timeout_seconds"]

        collected_output: list[str] = []
        try:
            async for event in react_engine.execute_stream(
                messages=messages,
                tools=tools,
                model=model,
                agent_name=agent.name,
                system_prompt=system_prompt,
                timeout_seconds=timeout_seconds,
            ):
                if event.event_type == "final_answer":
                    collected_output.append(event.data.get("output", ""))
                yield {
                    "event": event.event_type,
                    "data": json.dumps(
                        {
                            "step": event.step,
                            "data": event.data,
                            "timestamp": event.timestamp,
                        }
                    ),
                }
        except asyncio.CancelledError:
            raise
        except Exception as e:
            logger.exception("Portal SSE ReAct execution failed")
            yield {
                "event": "error",
                "data": json.dumps({"error": str(e)}),
            }
            return

        response_text = _ensure_non_empty(
            "".join(collected_output) if collected_output else None
        )
        await _conversation_store.add_message(conv.id, "assistant", response_text)

    return EventSourceResponse(event_generator())


@router.get("/portal/capabilities", response_model=CapabilitiesResponse)
async def get_capabilities(req: Request, _auth: None = Depends(_verify_api_key)):
    """List all available capabilities with their status.

    Every category in ``CAPABILITY_CATEGORIES`` is reported as enabled; its
    ``skill_count`` is the number of registered skills carrying a capability
    tag equal to the category name ("skills" counts every skill).
    """
    skill_registry = req.app.state.skill_registry
    all_skills = skill_registry.list_skills()

    # Build a map of capability tag -> skill count
    cap_skill_counts: dict[str, int] = {}
    for skill in all_skills:
        for cap in skill.capabilities:
            cap_skill_counts[cap.tag] = cap_skill_counts.get(cap.tag, 0) + 1
        # Also count the skill itself toward "skills" category
        cap_skill_counts["skills"] = cap_skill_counts.get("skills", 0) + 1

    capabilities = [
        CapabilityInfo(
            name=cat_name,
            display_name=cat_info["display_name"],
            description=cat_info["description"],
            icon=cat_info["icon"],
            enabled=True,
            skill_count=cap_skill_counts.get(cat_name, 0),
        )
        for cat_name, cat_info in CAPABILITY_CATEGORIES.items()
    ]

    return CapabilitiesResponse(capabilities=capabilities)


@router.get("/portal/conversations")
async def list_conversations(limit: int = 20, _auth: None = Depends(_verify_api_key)):
    """List recent conversations.

    For each conversation, derive the title from the first user message read
    directly from SQLite (independent of the in-memory cache, which may have an
    empty `messages` list after a restart). This prevents the regression where
    titles collapse to the placeholder "对话".

    Also tags each conversation with ``is_board`` so the sidebar can show a
    board-meeting ("私董会") badge without having to fetch every conversation's
    full history.
    """
    convs = await _conversation_store.list_conversations(limit=limit)
    result: list[dict] = []
    for c in convs:
        # Re-derive title from the persisted user message so cache misses
        # after a restart don't surface the default placeholder.
        first_user = await _conversation_store.get_first_user_message(c.id)
        title = _derive_conversation_title_from_content(first_user.content if first_user else None)
        is_board = await _conversation_has_board_started(c.id)
        result.append(
            {
                "id": c.id,
                "title": title,
                "created_at": c.created_at.isoformat(),
                "updated_at": c.updated_at.isoformat(),
                "message_count": len(c.messages),
                "is_board": is_board,
            }
        )
    return result


async def _conversation_has_board_started(conversation_id: str) -> bool:
    """Return True if the conversation contains a persisted board_started event.

    Used by the sidebar list so it can render the board-meeting ("私董会")
    badge without fetching the full message history. Returns False on a
    storage error so the badge never blocks the list endpoint.
    """
    try:
        return await _conversation_store.has_message_with_type(
            conversation_id, "board_started"
        )
    except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError):
        logger.warning("is_board lookup failed for %s", conversation_id, exc_info=True)
        return False


def _derive_conversation_title(conv: Conversation) -> str:
    """Derive a human-readable title from the first user message in the conversation object.

    The title is the first 20 characters of that message, with "..." appended
    when it was truncated; "对话" is returned when there is no such message.
    """
    for msg in conv.messages:
        if msg.role == "user" and msg.content:
            return msg.content[:20] + ("..." if len(msg.content) > 20 else "")
    return "对话"


_COMMAND_PREFIX_RE = re.compile(
    r"^@(?:board|team)(?::[^\s]+)?(?:\s+rounds=\d+)?\s*",
    re.IGNORECASE,
)


def _strip_command_prefix(content: str) -> str:
    """Strip a leading @board/@team command so conversation titles show only the topic.

    Examples:
        "@board:warren,charlie 怎么看 AI" → "怎么看 AI"
        "@team 私董会" → "私董会"
        "@board rounds=3 软件行业" → "软件行业"
    """
    if not content:
        return ""
    return _COMMAND_PREFIX_RE.sub("", content, count=1).strip()


def _derive_conversation_title_from_content(content: str | None) -> str:
    """Derive title from a string content (used when conv.messages is empty).

    A leading @board/@team command is stripped first; the remainder is cut to
    20 characters plus "..." when truncated. Falls back to "对话" when nothing
    is left.
    """
    if content:
        cleaned = _strip_command_prefix(content)
        if cleaned:
            return cleaned[:20] + ("..." if len(cleaned) > 20 else "")
    return "对话"


@router.get("/portal/conversations/{conversation_id}")
async def get_conversation(
    conversation_id: str, limit: int = 50, _auth: None = Depends(_verify_api_key)
):
    """Get conversation history from SQLite-backed store.

    Title is derived from the first user message in `history` (the authoritative
    source) rather than the in-memory cache, which may have an empty `messages`
    list after a server restart. This prevents the regression where selecting a
    conversation collapses the title to "对话".

    Raises:
        HTTPException: 404 when the store returns no history for the id.
    """
    history = await _conversation_store.get_history(conversation_id, limit=limit)
    if not history:
        raise HTTPException(status_code=404, detail=f"Conversation '{conversation_id}' not found")

    conv = await _conversation_store.get_or_create(conversation_id)
    first_user_content = next(
        (m.content for m in history if m.role == "user" and m.content),
        None,
    )
    return {
        "id": conv.id,
        "title": _derive_conversation_title_from_content(first_user_content),
        "messages": [_hydrate_persisted_message(conv.id, i, m) for i, m in enumerate(history)],
        "created_at": conv.created_at.isoformat(),
        "updated_at": conv.updated_at.isoformat(),
        "is_board": any(
            (m.metadata or {}).get("message_type") == "board_started" for m in history
        ),
    }


# Fields we store inside Message.metadata to reconstruct board_* messages
# after a page reload. The Message dataclass only has role/content/timestamp
# as first-class columns; everything else (message_type, expert identity,
# board round/role, conclusion payload) rides along in metadata.
_PERSISTED_MESSAGE_FIELDS = (
    "message_type",
    "expert_id",
    "expert_name",
    "expert_avatar",
    "expert_color",
    "board_round",
    "board_role",
    "board_conclusion",
    "board_started",
    "matched_skill",
    "confidence",
    "routing_method",
    "thinking",
    "tool_calls",
)


def _hydrate_persisted_message(conv_id: str, index: int, msg) -> dict:
    """Build the API response dict for a single persisted message.

    Promotes well-known rendering fields from ``msg.metadata`` to the top
    level so the frontend can render board_speech / board_summary /
    board_conclusion cards after a reload — without this, every restored
    assistant message would look like a plain chat bubble.

    The message id is synthesized as ``"{conv_id}-{index}"``. Only keys listed
    in ``_PERSISTED_MESSAGE_FIELDS`` whose value is not None are promoted.
    """
    payload: dict = {
        "id": f"{conv_id}-{index}",
        "role": msg.role,
        "content": msg.content,
        "timestamp": msg.timestamp.isoformat(),
        "metadata": getattr(msg, "metadata", None) or {},
    }
    meta = payload["metadata"]
    # Non-dict metadata cannot be inspected for known fields; return as-is.
    if not isinstance(meta, dict):
        return payload
    for key in _PERSISTED_MESSAGE_FIELDS:
        if key in meta and meta[key] is not None:
            payload[key] = meta[key]
    return payload


@router.delete("/portal/conversations/{conversation_id}")
async def delete_conversation(conversation_id: str, _auth: None = Depends(_verify_api_key)):
    """Delete a conversation and all its messages.

    Responds 404 when the store reports that nothing was deleted.

    ponytail: IDOR note — portal endpoints use API-key auth (single-tenant
    access model: API key = full access to all conversations). The SQLite
    store has no user_id column, so per-user ownership cannot be enforced
    without a schema migration. If API keys become per-user, add a user_id
    column to conversations + filter DELETE by (id, user_id). Upgrade path:
    migrate portal endpoints to JWT auth + per-user scoping.
    """
    deleted = await _conversation_store.delete_conversation(conversation_id)
    if not deleted:
        raise HTTPException(status_code=404, detail=f"Conversation '{conversation_id}' not found")
    return {"deleted": True, "id": conversation_id}


def _derive_title_from_messages(messages: list) -> str:
    """Derive title from a list of Message objects (SessionManager format).

    Reads ``msg.role.value``, so each role is expected to be an enum-like
    object rather than a plain string. Returns the first 20 characters of the
    first non-empty user message (plus "..." when truncated), else "对话".
    """
    for msg in messages:
        if msg.role.value == "user" and msg.content:
            return msg.content[:20] + ("..." if len(msg.content) > 20 else "")
    return "对话"


async def _execute_react_background(
    react_engine: ReActEngine,
    messages: list[dict],
    tools: list,
    model: str,
    agent_name: str,
    system_prompt: str | None,
    timeout_seconds: float | None,
    conv_id: str,
    task_id: str,
    event_queue: EventQueue,
    conversation_store: SqliteConversationStore,
    task_store: InMemoryTaskStore | None = None,
) -> None:
    """Execute ReAct engine in the background, decoupled from WebSocket lifecycle.

    Events are emitted to the EventQueue (filtered by task_id) so that any
    subscriber — including a reconnected WebSocket — can consume them.
    Results are always persisted to the conversation store, regardless of
    whether a WebSocket subscriber is active. Task status is tracked in
    TaskStore when provided.

    Outcome by path:
        * Normal completion: the joined ``final_answer`` outputs are persisted
          as the assistant message, the task is marked COMPLETED and
          ``TASK_COMPLETED`` is emitted.
        * Cancellation: partial output (if any) is persisted, the task is
          marked FAILED, ``TASK_FAILED`` is emitted, and the
          ``CancelledError`` is re-raised.
        * Any other exception: partial output (if any) is persisted, the task
          is marked FAILED with the error text, ``TASK_FAILED`` is emitted,
          and the exception is NOT re-raised.

    TaskStore update failures are logged and never abort the run.
    """
    collected_output: list[str] = []

    try:
        if task_store is not None:
            try:
                await _task_store_update_status(
                    task_store, task_id, TaskStatus.RUNNING, started_at=datetime.now(timezone.utc)
                )
            except (
                ConnectionError,
                OSError,
                asyncio.TimeoutError,
                ValueError,
                KeyError,
                RuntimeError,
            ):
                logger.warning("Failed to update TaskStore RUNNING", exc_info=True)

        async for event in react_engine.execute_stream(
            messages=messages,
            tools=tools,
            model=model,
            agent_name=agent_name,
            system_prompt=system_prompt,
            timeout_seconds=timeout_seconds,
        ):
            if event.event_type == "final_answer":
                collected_output.append(event.data.get("output", ""))

            # P1 #8/#9/#10 fix: Preserve TurnEventType mapping, step field,
            # and original data structure for EQ subscriber compatibility.
            # Note: Event dataclass has no 'step' field; use getattr for
            # compatibility with ReActEngine events that may include it.
            # Event types missing from the map are not forwarded to the EQ.
            _turn_event_type = _REACT_EVENT_TYPE_MAP.get(event.event_type)
            if _turn_event_type is not None:
                await _emit_event_safe(
                    event_queue,
                    _turn_event_type,
                    task_id=task_id,
                    session_id=conv_id,
                    data={
                        **event.data,
                        "step": getattr(event, "step", 0),
                        "timestamp": event.timestamp,
                    },
                )

        # Normal completion: persist result
        response_text = _ensure_non_empty("".join(collected_output) if collected_output else None)
        await conversation_store.add_message(conv_id, "assistant", response_text)

        if task_store is not None:
            try:
                await _task_store_update_status(
                    task_store,
                    task_id,
                    TaskStatus.COMPLETED,
                    output_data={"output": response_text},
                    completed_at=datetime.now(timezone.utc),
                    progress=1.0,
                    progress_message="Completed",
                )
            except (
                ConnectionError,
                OSError,
                asyncio.TimeoutError,
                ValueError,
                KeyError,
                RuntimeError,
            ):
                logger.warning("Failed to update TaskStore COMPLETED", exc_info=True)

        # Emit task.completed so subscribers know the task is done
        await _emit_event_safe(
            event_queue,
            TaskEventType.TASK_COMPLETED,
            task_id=task_id,
            session_id=conv_id,
            data={"output": response_text, "timestamp": datetime.now(timezone.utc).isoformat()},
        )

    except asyncio.CancelledError:
        # Application shutdown or explicit cancel — persist partial output
        # and mark task as FAILED so resume does not block forever.
        # P0 #1/#2 fix: ALL persistence operations must use asyncio.shield
        # and the async TaskStore shim. Without shield, a re-entrant
        # cancellation kills the cleanup itself; without the shim,
        # RedisTaskStore (async) silently drops the coroutine.
        if collected_output:
            partial = _ensure_non_empty("".join(collected_output))
            try:
                await asyncio.shield(conversation_store.add_message(conv_id, "assistant", partial))
            except (
                asyncio.CancelledError,
                ConnectionError,
                OSError,
                asyncio.TimeoutError,
                ValueError,
                KeyError,
                RuntimeError,
            ):
                logger.warning("Failed to persist partial output on cancel")

        if task_store is not None:
            try:
                await asyncio.shield(
                    _task_store_update_status(
                        task_store,
                        task_id,
                        TaskStatus.FAILED,
                        error_message="Task cancelled",
                        completed_at=datetime.now(timezone.utc),
                    )
                )
            except (
                asyncio.CancelledError,
                ConnectionError,
                OSError,
                asyncio.TimeoutError,
                ValueError,
                KeyError,
                RuntimeError,
            ):
                logger.warning("Failed to update TaskStore on cancel", exc_info=True)

        # P0 #2 fix: _emit_event_safe is async (it awaits event_queue.emit).
        # Shield it so a re-entrant CancelledError doesn't kill the emit
        # and leave subscribers blocked until timeout.
        try:
            await asyncio.shield(
                _emit_event_safe(
                    event_queue,
                    TaskEventType.TASK_FAILED,
                    task_id=task_id,
                    session_id=conv_id,
                    data={
                        "error": "Task cancelled",
                        "timestamp": datetime.now(timezone.utc).isoformat(),
                    },
                )
            )
        except (asyncio.CancelledError, asyncio.QueueFull, RuntimeError, ConnectionError):
            logger.warning("Failed to emit TASK_FAILED on cancel")
        raise  # Propagate cancellation

    except Exception as e:
        # Persist any partial output collected before the error
        if collected_output:
            partial = _ensure_non_empty("".join(collected_output))
            try:
                await conversation_store.add_message(conv_id, "assistant", partial)
            except (
                ConnectionError,
                OSError,
                asyncio.TimeoutError,
                ValueError,
                KeyError,
                RuntimeError,
            ):
                logger.warning("Failed to persist partial output in background task")

        if task_store is not None:
            try:
                await _task_store_update_status(
                    task_store,
                    task_id,
                    TaskStatus.FAILED,
                    error_message=str(e),
                    completed_at=datetime.now(timezone.utc),
                )
            except (
                ConnectionError,
                OSError,
                asyncio.TimeoutError,
                ValueError,
                KeyError,
                RuntimeError,
            ):
                logger.warning("Failed to update TaskStore FAILED", exc_info=True)

        # Emit task.failed so subscribers know the task failed
        await _emit_event_safe(
            event_queue,
            TaskEventType.TASK_FAILED,
            task_id=task_id,
            session_id=conv_id,
            data={"error": str(e), "timestamp": datetime.now(timezone.utc).isoformat()},
        )


@router.websocket("/portal/ws")
async def portal_websocket(websocket: WebSocket):
    """Real-time chat WebSocket endpoint."""
    await websocket.accept()

    # ponytail: ws_user_id must be initialized before any early return — the
    # finally block below references it. Previously the api_key reject path
    # returned before assignment, causing UnboundLocalError that masked the
    # original auth error. Upgrade path: refactor auth into a decorator.
    ws_user_id: str | None = None

    # Authentication (after accept, since FastAPI requires accept before close)
    configured_api_key: str | None = None
    if hasattr(websocket.app.state, "server_config") and websocket.app.state.server_config:
        configured_api_key = websocket.app.state.server_config.api_key
    if configured_api_key is None and hasattr(websocket.app.state, "api_key"):
        configured_api_key = websocket.app.state.api_key

    # Check api_key query param
    if configured_api_key:
        provided = websocket.query_params.get("api_key")
        if not hmac.compare_digest((provided or "").encode(), configured_api_key.encode()):
            await websocket.send_json(
                {"type": "error", "data": {"message": "Invalid or missing api_key"}}
            )
            await websocket.close(code=4001, reason="Invalid or missing api_key")
            return

    # Track authenticated portal connections for user-scoped push (calendar
    # reminders, etc.). user_id is None for API-key / dev-mode clients.
current_user = getattr(websocket.state, "current_user", None) or {} ws_user_id = current_user.get("user_id") if ws_user_id: portal_connection_manager.add(ws_user_id, websocket) # Wait for first chat message before creating conversation conv: Conversation | None = None # task_id is per-user-message; tracked here so the outer except can emit task.failed task_id: str | None = None # Track the active background task so cancel can propagate to it. active_bg_task: asyncio.Task | None = None try: while True: try: timeout = _WS_HEARTBEAT_TIMEOUT if _WS_HEARTBEAT_TIMEOUT > 0 else None raw = await asyncio.wait_for(websocket.receive_text(), timeout=timeout) except asyncio.TimeoutError: await websocket.close(code=1000, reason="Heartbeat timeout") return try: msg = json.loads(raw) except json.JSONDecodeError: continue msg_type = msg.get("type") if msg_type == "cancel": # Cancel the active background task if still running if active_bg_task is not None and not active_bg_task.done(): active_bg_task.cancel() active_bg_task = None await websocket.send_json( { "type": "result", "data": { "status": "cancelled", "timestamp": datetime.now(timezone.utc).isoformat(), }, } ) return if msg_type == "ping": await websocket.send_json({"type": "pong"}) continue if msg_type == "resume": # Frontend reconnected and wants to resume a running task resume_task_id = msg.get("task_id", "") if not resume_task_id: continue # P1 #3/#4 fix: Fail-closed ownership verification. # Require conversation_id and TaskStore — reject if either # is missing, to prevent cross-conversation task hijacking # via empty conversation_id or unconfigured TaskStore. 
resume_conv_id = msg.get("conversation_id", "") if not resume_conv_id: await websocket.send_json( { "type": "error", "data": { "message": "Resume requires conversation_id.", "task_id": resume_task_id, }, } ) continue resume_task_store: InMemoryTaskStore | None = getattr( websocket.app.state, "task_store", None ) resume_eq: EventQueue | None = getattr(websocket.app.state, "event_queue", None) # P1 #4: Fail-closed if TaskStore is unavailable — cannot # verify ownership without it. if resume_task_store is None: await websocket.send_json( { "type": "error", "data": { "message": "Resume not supported (TaskStore unavailable). Please retry your request.", "task_id": resume_task_id, }, } ) continue try: record = await _task_store_get(resume_task_store, resume_task_id) except ( ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError, ): logger.warning("TaskStore.get failed during resume", exc_info=True) record = None if record is not None: # P1 #3: Fail-closed ownership check — reject if # conversation_id is missing from task metadata OR # does not match the request. 
task_conv_id = (record.metadata or {}).get("conversation_id", "") if not task_conv_id or resume_conv_id != task_conv_id: logger.warning( "Resume rejected: conversation_id mismatch " "(task=%s, request=%s, task_id=%s)", task_conv_id, resume_conv_id, resume_task_id, ) await websocket.send_json( { "type": "error", "data": { "message": "Task does not belong to this conversation.", "task_id": resume_task_id, }, } ) continue if record.status == TaskStatus.COMPLETED: # Task already finished — send result immediately output = (record.output_data or {}).get("output", "") await websocket.send_json( { "type": "result", "data": { "message": output, "timestamp": record.completed_at.isoformat() if record.completed_at else datetime.now(timezone.utc).isoformat(), }, } ) continue elif record.status == TaskStatus.FAILED: await websocket.send_json( { "type": "error", "data": { "message": record.error_message or "Task failed", }, } ) continue else: # Task not found in store — cannot resume await websocket.send_json( { "type": "error", "data": { "message": "Task not found or has expired. Please retry your request.", "task_id": resume_task_id, }, } ) continue # Task is still running — subscribe to EventQueue for remaining events. # H6: if EventQueue is unavailable, inform the client instead of # silently continuing (which would leave the UI loading forever). if resume_eq is None: await websocket.send_json( { "type": "error", "data": { "message": "Resume not supported (EventQueue unavailable). Please retry your request.", }, } ) continue # C2: bound the subscribe loop with a timeout so a dead # background task cannot block resume forever. 
resume_timeout = _WS_HEARTBEAT_TIMEOUT * 10 if _WS_HEARTBEAT_TIMEOUT > 0 else 600 try: async with asyncio.timeout(resume_timeout): async for event in resume_eq.subscribe(task_id=resume_task_id): if event.event_type == TaskEventType.TASK_COMPLETED: response_text = event.data.get("output", EMPTY_LLM_RESPONSE) await websocket.send_json( { "type": "result", "data": { "message": response_text, "timestamp": event.data.get( "timestamp", datetime.now(timezone.utc).isoformat(), ), }, } ) break elif event.event_type == TaskEventType.TASK_FAILED: await websocket.send_json( { "type": "error", "data": { "message": event.data.get("error", "Unknown error"), }, } ) break else: # P1 #8/#10 fix: step and data are now # top-level fields in event.data. await websocket.send_json( { "type": "step", "data": { "event_type": event.event_type, "step": event.data.get("step", 0), "data": { k: v for k, v in event.data.items() if k not in ("step", "timestamp") }, "timestamp": event.data.get("timestamp", ""), }, } ) except TimeoutError: logger.warning(f"Resume subscribe timed out for task {resume_task_id}") await websocket.send_json( { "type": "error", "data": { "message": "Task resume timed out. Please retry your request.", "task_id": resume_task_id, }, } ) except RuntimeError as exc: # P1 #5: subscriber limit reached or EQ closed — send # a friendly error instead of terminating the connection. 
logger.warning("Resume subscribe failed for task %s: %s", resume_task_id, exc) await websocket.send_json( { "type": "error", "data": { "message": "Server busy, please retry shortly.", "task_id": resume_task_id, }, } ) continue if msg_type != "chat": continue message_text = msg.get("message", "") model_override = msg.get("model") # Frontend model selector if not message_text: continue # Create or switch conversation based on conversation_id from frontend conv_id = msg.get("conversation_id") if conv_id: if conv is None or conv.id != conv_id: conv = await _conversation_store.get_or_create(conv_id) await websocket.send_json({"type": "connected", "conversation_id": conv.id}) elif conv is None: conv = await _conversation_store.get_or_create(conv_id) await websocket.send_json({"type": "connected", "conversation_id": conv.id}) # @board / @team intercept — mirror chat.py:1076-1081. # Frontend WS connects to /api/v1/portal/ws (this router), not # /api/v1/chat/ws/{session_id} (chat.py). Without this intercept # @board/@team messages are treated as plain text and no # board_started/team_formed events are broadcast, so the cards # never render. Placed before task_id emit so intercepted # messages don't leave orphan tasks in the EQ side-channel. 
if await _execute_board_meeting(websocket, conv.id, message_text, _sm_adapter): continue if await _execute_team_collab(websocket, conv.id, message_text, _sm_adapter): continue # Generate task_id for this user message and emit task.created to EQ # (EQ is a side-channel: emit failures never break the WebSocket flow) task_id = str(uuid.uuid4()) event_queue: EventQueue | None = getattr(websocket.app.state, "event_queue", None) task_store: InMemoryTaskStore | None = getattr(websocket.app.state, "task_store", None) await _emit_event_safe( event_queue, TaskEventType.TASK_CREATED, task_id=task_id, session_id=conv.id, data={"message": message_text}, ) # Add user message to conversation await _conversation_store.add_message(conv.id, "user", message_text) start_time = datetime.now(timezone.utc) async def _record_experience( task_type: str, goal: str, outcome: str, duration_seconds: float ) -> None: """Record experience to dashboard after chat completion.""" try: exp = DashboardExperience( id=str(uuid.uuid4()), task_type=task_type, goal=goal[:200], outcome=outcome, duration_seconds=duration_seconds, created_at=datetime.now(timezone.utc), ) _dashboard_experiences.append(exp) await _broadcast_dashboard_event( "experience_added", { "id": exp.id, "task_type": exp.task_type, "goal": exp.goal, "outcome": exp.outcome, }, ) await _broadcast_dashboard_event("metrics_updated", {"period": "7d"}) except ( asyncio.QueueFull, RuntimeError, ConnectionError, ValueError, KeyError, ) as e: logger.warning(f"Failed to record experience: {e}") # Unified preprocessing via RequestPreprocessor (minimal: @skill prefix + greeting regex + REACT) pool = websocket.app.state.agent_pool skill_registry = websocket.app.state.skill_registry llm_gateway = websocket.app.state.llm_gateway request_preprocessor: RequestPreprocessor = websocket.app.state.request_preprocessor all_skills = skill_registry.list_skills() # Get default tools for RequestPreprocessor default_tools = [] default_system_prompt = None 
default_agent = pool.get_agent("default") if default_agent is not None: default_tools = default_agent.get_tools() default_system_prompt = ( getattr(default_agent, "_system_prompt", None) or default_agent.get_system_prompt() ) else: for skill in all_skills: agent = pool.get_agent(skill.name) if agent is not None: default_tools = agent.get_tools() default_system_prompt = ( getattr(agent, "_system_prompt", None) or agent.get_system_prompt() ) break # Preprocess via RequestPreprocessor (minimal: @skill prefix + greeting regex + REACT) routing_result = await request_preprocessor.preprocess( content=message_text, skill_registry=skill_registry, default_tools=default_tools, default_system_prompt=default_system_prompt, default_model=model_override or "default", default_agent_name="default", ) await websocket.send_json( { "type": "routing", "skill": routing_result.agent_name or "default", "method": routing_result.match_method or "intent", "confidence": routing_result.match_confidence, } ) # Emit task.started to EQ (execution begins after routing) await _emit_event_safe( event_queue, TaskEventType.TASK_STARTED, task_id=task_id, session_id=conv.id, data={ "agent_name": routing_result.agent_name or "default", "execution_mode": routing_result.execution_mode.value if hasattr(routing_result.execution_mode, "value") else str(routing_result.execution_mode), }, ) # Register task in TaskStore for status tracking and recovery if task_store is not None: try: await _task_store_create( task_store, task_id=task_id, agent_name=routing_result.agent_name or "default", input_data={"message": message_text}, skill_name=routing_result.skill_name, ) # Store conversation_id in metadata for frontend recovery await _task_store_update_status( task_store, task_id, TaskStatus.PENDING, metadata={"conversation_id": conv.id}, ) except ( ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError, ): logger.warning("Failed to register task in TaskStore", exc_info=True) # Execute based 
on routing result's execution_mode # This is the single source of truth for path selection, # replacing fragile string-matching on match_method. if routing_result.execution_mode == ExecutionMode.DIRECT_CHAT: # Zero-cost path: direct LLM call, no ReAct loop chat_messages = [] # Inject system prompt (contains SOUL/USER/MEMORY/DAILY) for identity continuity if routing_result.system_prompt: chat_messages.append( {"role": "system", "content": routing_result.system_prompt} ) chat_messages.append({"role": "user", "content": message_text}) # Inject conversation history for context continuity history_msgs = await _build_history_messages(conv.id) for hm in history_msgs: chat_messages.insert(-1, hm) response = await llm_gateway.chat( messages=chat_messages, model=model_override or "default", agent_name="default", task_type="chat", ) # Store assistant reply for multi-turn context continuity response_content = _ensure_non_empty(response.content) await _conversation_store.add_message(conv.id, "assistant", response_content) # Update TaskStore status to COMPLETED if task_store is not None: try: await _task_store_update_status( task_store, task_id, TaskStatus.COMPLETED, output_data={"output": response_content}, completed_at=datetime.now(timezone.utc), progress=1.0, progress_message="Completed", ) except ( ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError, ): logger.warning("Failed to update TaskStore for DIRECT_CHAT", exc_info=True) # Emit turn.final_answer and task.completed to EQ await _emit_event_safe( event_queue, TurnEventType.FINAL_ANSWER, task_id=task_id, session_id=conv.id, data={"output": response_content}, ) await _emit_event_safe( event_queue, TaskEventType.TASK_COMPLETED, task_id=task_id, session_id=conv.id, data={"output": response_content}, ) await websocket.send_json( { "type": "result", "data": { "message": response_content, "timestamp": datetime.now(timezone.utc).isoformat(), }, } ) await _record_experience( "chat", message_text, 
"success", (datetime.now(timezone.utc) - start_time).total_seconds(), ) continue # REACT / SKILL_REACT / REWOO / REFLEXION / PLAN_EXEC / TEAM_COLLAB # Advanced modes fall back to REACT with a warning. if routing_result.execution_mode not in ( ExecutionMode.REACT, ExecutionMode.SKILL_REACT, ): logger.warning( f"Execution mode {routing_result.execution_mode.value} not yet supported " f"in portal WebSocket, falling back to REACT" ) agent_name = routing_result.agent_name or "default" agent = pool.get_agent(agent_name) if agent is None: # Agent not in pool — fall back to direct chat. # This handles the case where routing returned an agent_name # that doesn't exist in the pool (e.g. "default" or a # skill that hasn't been instantiated yet). logger.info( f"Session {conv.id}: agent '{agent_name}' not in pool, falling back to direct chat" ) chat_messages = [] # Inject system prompt (contains SOUL/USER/MEMORY/DAILY) for identity continuity if routing_result.system_prompt: chat_messages.append( {"role": "system", "content": routing_result.system_prompt} ) chat_messages.append({"role": "user", "content": message_text}) try: history = await _conversation_store.get_history(conv.id, limit=20) for hist_msg in history[:-1]: if hist_msg.role in ("user", "assistant"): chat_messages.insert( -1, {"role": hist_msg.role, "content": hist_msg.content} ) except ( ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError, ): pass response = await llm_gateway.chat( messages=chat_messages, model=model_override or "default", agent_name="default", task_type="chat", ) # Store assistant reply for multi-turn context continuity response_content = _ensure_non_empty(response.content) await _conversation_store.add_message(conv.id, "assistant", response_content) # Emit turn.final_answer and task.completed to EQ (fallback path) await _emit_event_safe( event_queue, TurnEventType.FINAL_ANSWER, task_id=task_id, session_id=conv.id, data={"output": response_content}, ) await 
_emit_event_safe( event_queue, TaskEventType.TASK_COMPLETED, task_id=task_id, session_id=conv.id, data={"output": response_content}, ) await websocket.send_json( { "type": "result", "data": { "status": "completed", "content": response_content, "timestamp": datetime.now(timezone.utc).isoformat(), }, } ) await _record_experience( "chat", message_text, "success", (datetime.now(timezone.utc) - start_time).total_seconds(), ) continue # Execute via ReAct stream react_config = agent.get_react_config() # Reuse agent's ReActEngine if available (aligned with chat.py pattern) react_engine = getattr(agent, "_react_engine", None) if react_engine is None: react_engine = ReActEngine( llm_gateway=llm_gateway, max_steps=react_config["max_steps"], ) else: react_engine.reset() messages = [{"role": "user", "content": message_text}] # Inject conversation history for context continuity history_msgs = await _build_history_messages(conv.id) for hm in reversed(history_msgs): messages.insert(0, hm) tools = agent.get_tools() model = model_override or agent.get_model() system_prompt = getattr(agent, "_system_prompt", None) or agent.get_system_prompt() timeout_seconds = react_config["timeout_seconds"] logger.info( f"[portal] agent='{agent_name}' tools={len(tools)} " f"[{', '.join(t.name for t in tools)}] model={model}" ) # Start ReAct execution as a background task, decoupled from # WebSocket lifecycle. When the WebSocket disconnects, the # background task continues running and persists the result. 
bg_task = asyncio.create_task( _execute_react_background( react_engine=react_engine, messages=messages, tools=tools, model=model, agent_name=agent.name, system_prompt=system_prompt, timeout_seconds=timeout_seconds, conv_id=conv.id, task_id=task_id, event_queue=event_queue, conversation_store=_conversation_store, task_store=task_store, ) ) _running_background_tasks.add(bg_task) bg_task.add_done_callback(_running_background_tasks.discard) active_bg_task = bg_task # C1 guard: EventQueue is required for subscribe; fall back to # awaiting the background task directly if unavailable. if event_queue is None: logger.warning("EventQueue not configured; awaiting background task directly") try: await bg_task except (RuntimeError, ConnectionError, asyncio.TimeoutError): pass # errors handled inside _execute_react_background active_bg_task = None continue # Subscribe to EventQueue (filtered by task_id) and forward # events to the WebSocket. When the WebSocket disconnects, # this loop exits but the background task continues. # P1 #7 fix: bound the subscribe loop with a timeout so a # hung background task cannot block the WebSocket forever. # Matches the resume path's timeout strategy. 
_subscribe_timeout = _WS_HEARTBEAT_TIMEOUT * 10 if _WS_HEARTBEAT_TIMEOUT > 0 else 600 try: async with asyncio.timeout(_subscribe_timeout): async for event in event_queue.subscribe(task_id=task_id): if event.event_type == TaskEventType.TASK_COMPLETED: response_text = event.data.get("output", EMPTY_LLM_RESPONSE) await websocket.send_json( { "type": "result", "data": { "message": response_text, "timestamp": event.data.get( "timestamp", datetime.now(timezone.utc).isoformat(), ), }, } ) await _record_experience( routing_result.skill_name or "agent", message_text, "success" if response_text != EMPTY_LLM_RESPONSE else "failure", (datetime.now(timezone.utc) - start_time).total_seconds(), ) break elif event.event_type == TaskEventType.TASK_FAILED: await websocket.send_json( { "type": "error", "data": { "message": event.data.get("error", "Unknown error"), }, } ) await _record_experience( routing_result.skill_name or "agent", message_text, "failure", (datetime.now(timezone.utc) - start_time).total_seconds(), ) break else: # Forward ReAct events as step messages. # P1 #8/#10 fix: step and data are now top-level # fields in event.data (no longer nested). await websocket.send_json( { "type": "step", "data": { "event_type": event.event_type, "step": event.data.get("step", 0), "data": { k: v for k, v in event.data.items() if k not in ("step", "timestamp") }, "timestamp": event.data.get("timestamp", ""), }, } ) except TimeoutError: logger.warning(f"Subscribe loop timed out for task {task_id}") if active_bg_task is not None and not active_bg_task.done(): active_bg_task.cancel() await websocket.send_json( { "type": "error", "data": { "message": "Task timed out. Please retry your request.", "task_id": task_id, }, } ) except RuntimeError as exc: # P1 #5: subscriber limit reached or EQ closed — send # a friendly error instead of terminating the connection. 
logger.warning("Subscribe failed for task %s: %s", task_id, exc) await websocket.send_json( { "type": "error", "data": { "message": "Server busy, please retry shortly.", "task_id": task_id, }, } ) except WebSocketDisconnect: logger.debug(f"Portal WebSocket disconnected for conversation {conv.id if conv else 'N/A'}") # P0 fix: Do NOT cancel the background task on disconnect. # The entire purpose of the three-layer defense is to let the # background task continue running and persist the result so the # frontend can resume it after reconnection. Cancelling here would # kill the task, lose the full output, and mark it FAILED — # defeating layers 2 and 3. The task is only cancelled on explicit # user cancel (msg_type == 'cancel') or application shutdown. except asyncio.CancelledError: raise except Exception as e: logger.error(f"Portal WebSocket error: {e}") # P1 #6 fix: Do NOT cancel the background task on connection-level # errors (ConnectionResetError, BrokenPipeError, etc.). These are # functionally equivalent to WebSocketDisconnect — the client dropped # — and the background task must survive to persist its result. # Only cancel on truly unexpected errors that may have corrupted # state needed by the background task. if not isinstance(e, (ConnectionResetError, BrokenPipeError, ConnectionError)): if active_bg_task is not None and not active_bg_task.done(): active_bg_task.cancel() # Emit task.failed to EQ if a task was in progress # (task_id is set when a user message is received; None before that) if task_id is not None and conv is not None: event_queue = getattr(websocket.app.state, "event_queue", None) await _emit_event_safe( event_queue, TaskEventType.TASK_FAILED, task_id=task_id, session_id=conv.id, data={"error": str(e)}, ) try: await websocket.send_json({"type": "error", "data": {"message": str(e)}}) except (ConnectionError, RuntimeError, asyncio.TimeoutError): pass finally: # Remove from user-scoped push tracking on any disconnect/error/return. 
if ws_user_id: portal_connection_manager.remove(ws_user_id, websocket)