1985 lines
81 KiB
Python
1985 lines
81 KiB
Python
import asyncio
|
|
import hmac
|
|
import json
|
|
import re
|
|
import logging
|
|
import os
|
|
import uuid
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
from fastapi import (
|
|
APIRouter,
|
|
Depends,
|
|
HTTPException,
|
|
Request,
|
|
WebSocket,
|
|
WebSocketDisconnect,
|
|
Security,
|
|
)
|
|
from fastapi.security import APIKeyHeader, APIKeyQuery
|
|
from pydantic import BaseModel
|
|
|
|
from agentkit.core.config_driven import ConfigDrivenAgent
|
|
from agentkit.core.event_queue import EventQueue
|
|
from agentkit.core.protocol import Event, TaskEventType, TaskStatus, TurnEventType
|
|
from agentkit.core.react import ReActEngine
|
|
from agentkit.chat.skill_routing import ExecutionMode, SkillRoutingResult
|
|
from agentkit.chat.request_preprocessor import RequestPreprocessor
|
|
from agentkit.server.routes.evolution_dashboard import (
|
|
_experiences as _dashboard_experiences,
|
|
DashboardExperience,
|
|
_broadcast_event as _broadcast_dashboard_event,
|
|
)
|
|
from agentkit.core.fallback import EMPTY_LLM_RESPONSE
|
|
from agentkit.chat.sqlite_conversation_store import SqliteConversationStore
|
|
from agentkit.server.task_store import InMemoryTaskStore
|
|
from agentkit.session.models import MessageRole
|
|
|
|
# ponytail: importing module-private helpers from chat.py because the frontend
|
|
# WS connects to /api/v1/portal/ws (this router), not /api/v1/chat/ws/{session_id}.
|
|
# Without this, @board/@team prefixes are never intercepted and board/team cards
|
|
# never render. Upgrade path: extract these into a shared experts dispatch module.
|
|
from agentkit.server.routes.chat import _execute_board_meeting, _execute_team_collab
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Router that every portal endpoint in this file registers on.
router = APIRouter(tags=["portal"])

# Use a project-local SQLite file to avoid read-only sandbox restrictions on ~/.agentkit.
# parents[4] is the fifth ancestor directory of this file.
_PROJECT_ROOT = Path(__file__).parents[4]
# AGENTKIT_CONVERSATIONS_DB overrides the default <project root>/data/conversations.db.
_CONVERSATIONS_DB_PATH = Path(
    os.environ.get("AGENTKIT_CONVERSATIONS_DB", _PROJECT_ROOT / "data" / "conversations.db")
)
# NOTE: import-time side effect — the parent directory is created when this module loads.
_CONVERSATIONS_DB_PATH.parent.mkdir(parents=True, exist_ok=True)

# Track background ReAct tasks so they are not garbage-collected mid-execution.
# Tasks are removed automatically via add_done_callback when they complete.
_running_background_tasks: set[asyncio.Task] = set()

# ---------------------------------------------------------------------------
# API Key Authentication
# ---------------------------------------------------------------------------

# The key may arrive in the X-API-Key header or the api_key query parameter.
# auto_error=False lets a missing key reach _verify_api_key as None instead of
# being rejected by the security scheme itself.
_api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
_api_key_query = APIKeyQuery(name="api_key", auto_error=False)
|
|
|
|
|
|
def _ensure_non_empty(text: str | None) -> str:
|
|
"""Ensure response text is never empty or whitespace-only."""
|
|
if text and text.strip():
|
|
return text
|
|
return EMPTY_LLM_RESPONSE
|
|
|
|
|
|
async def _emit_event_safe(
    event_queue: EventQueue | None,
    event_type: str,
    task_id: str,
    session_id: str,
    data: dict | None = None,
) -> None:
    """Emit an event to the EventQueue without ever raising.

    The EQ is a side-channel: emit failures must never break the WebSocket flow.
    Every ``Exception`` raised while building or emitting the event is swallowed
    and logged at warning level (with traceback). ``asyncio.CancelledError`` is
    a ``BaseException`` and therefore still propagates, so task cancellation
    keeps working.

    Args:
        event_queue: The EventQueue to emit to (no-op if None)
        event_type: Event type (see TaskEventType / TurnEventType)
        task_id: Associated task ID
        session_id: Associated session ID (conversation_id)
        data: Optional event payload; an empty dict is sent when omitted
    """
    if event_queue is None:
        return
    try:
        event = Event.create(
            event_type=event_type,
            task_id=task_id,
            session_id=session_id,
            data=data or {},
        )
        await event_queue.emit(event)
    except Exception as e:
        # Deliberately broad: this is the best-effort boundary promised above.
        # Previously only QueueFull/RuntimeError/ConnectionError were caught,
        # so any other failure would have propagated into the caller.
        logger.warning("EventQueue emit failed (type=%s): %s", event_type, e, exc_info=True)
|
|
|
|
|
|
# P1 #14 fix: TaskStore sync/async compatibility shim.
|
|
# InMemoryTaskStore methods are sync; RedisTaskStore methods are async.
|
|
# These helpers detect and await coroutines so portal.py works with both.
|
|
async def _task_store_create(store, *args, **kwargs):
    """Call ``store.create`` and return its result for sync and async stores alike.

    All arguments are forwarded unchanged. If the call hands back a coroutine
    it is awaited; any other value is returned directly.
    """
    outcome = store.create(*args, **kwargs)
    return await outcome if asyncio.iscoroutine(outcome) else outcome
|
|
|
|
|
|
async def _task_store_get(store, *args, **kwargs):
    """Call ``store.get`` and return its result for sync and async stores alike.

    All arguments are forwarded unchanged. If the call hands back a coroutine
    it is awaited; any other value is returned directly.
    """
    outcome = store.get(*args, **kwargs)
    return await outcome if asyncio.iscoroutine(outcome) else outcome
|
|
|
|
|
|
async def _task_store_update_status(store, *args, **kwargs):
    """Call ``store.update_status`` and return its result for sync and async stores alike.

    All arguments are forwarded unchanged. If the call hands back a coroutine
    it is awaited; any other value is returned directly.
    """
    outcome = store.update_status(*args, **kwargs)
    return await outcome if asyncio.iscoroutine(outcome) else outcome
|
|
|
|
|
|
async def _task_store_list_tasks(store, *args, **kwargs):
    """Call ``store.list_tasks`` and return its result for sync and async stores alike.

    All arguments are forwarded unchanged. If the call hands back a coroutine
    it is awaited; any other value is returned directly.
    """
    outcome = store.list_tasks(*args, **kwargs)
    return await outcome if asyncio.iscoroutine(outcome) else outcome
|
|
|
|
|
|
async def _verify_api_key(
    request: Request,
    api_key_header: str | None = Security(_api_key_header),
    api_key_query: str | None = Security(_api_key_query),
) -> None:
    """FastAPI dependency that enforces the configured API key on REST endpoints.

    The expected key is read from ``app.state.server_config.api_key`` when a
    truthy ``server_config`` is present, otherwise from ``app.state.api_key``.
    When neither yields a key, every request is allowed (backwards compat).

    Raises:
        HTTPException: 401 when a key is configured and the supplied key
            (header first, then query parameter) does not match.
    """
    state = request.app.state

    expected: str | None = None
    server_config = getattr(state, "server_config", None)
    if server_config:
        expected = server_config.api_key
    if expected is None:
        expected = getattr(state, "api_key", None)

    # No key configured anywhere: authentication is disabled.
    if expected is None:
        return

    # A missing key is compared as the empty string; compare_digest keeps the
    # comparison constant-time.
    supplied = (api_key_header or api_key_query or "").encode()
    if hmac.compare_digest(supplied, expected.encode()):
        return

    raise HTTPException(
        status_code=401,
        detail="Invalid or missing API key. Provide via X-API-Key header or api_key query parameter.",
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Conversation models and SQLite-backed conversation store
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@dataclass
class ChatMessage:
    """One message of a conversation, as held in a ``Conversation``."""

    role: str  # "user" or "assistant"
    content: str
    # Timezone-aware UTC time, taken when the instance is created.
    timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    # Free-form extra data; default_factory gives every instance its own dict.
    metadata: dict = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class Conversation:
    """A conversation: an id plus its messages and creation/update times."""

    id: str
    # default_factory gives every conversation its own list.
    messages: list[ChatMessage] = field(default_factory=list)
    # Both timestamps default to the timezone-aware UTC time at instantiation.
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
    updated_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
|
|
|
|
|
# Heartbeat timeout in seconds — 0 disables timeout (for testing)
# Read once at import time from AGENTKIT_WS_TIMEOUT; defaults to 120.
_WS_HEARTBEAT_TIMEOUT = float(os.environ.get("AGENTKIT_WS_TIMEOUT", "120"))
# Single module-wide conversation store, opened on the DB path resolved above.
_conversation_store = SqliteConversationStore(db_path=_CONVERSATIONS_DB_PATH)
|
|
|
|
|
|
class _ConvStoreAsSessionManager:
    """Expose a SqliteConversationStore through SessionManager's append_message.

    chat.py's _execute_board_meeting / _execute_team_collab expect an object
    with that method; this wrapper forwards it to the conversation store.

    ponytail: append_message is the only method provided because it is the
    only one the board/team intercepts call. Add further SessionManager
    methods here if future logic needs them.
    """

    def __init__(self, store: SqliteConversationStore) -> None:
        self._store = store

    async def append_message(
        self,
        session_id: str,
        role: MessageRole,
        content: str,
        tool_call_id: str | None = None,
        agent_name: str | None = None,
        metadata: dict[str, object] | None = None,
    ) -> None:
        """Persist one message via the wrapped store's ``add_message``.

        ``role`` is reduced to its ``.value`` when it has one, otherwise to
        ``str(role)``. ``tool_call_id`` and ``agent_name`` are accepted for
        signature compatibility but are not forwarded to the store.
        """
        try:
            role_text = role.value
        except AttributeError:
            role_text = str(role)
        await self._store.add_message(session_id, role_text, content, metadata)


_sm_adapter = _ConvStoreAsSessionManager(_conversation_store)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Active portal WebSocket connections by user_id
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class PortalConnectionManager:
    """Track active portal WebSocket connections by authenticated user_id.

    Used by the calendar reminder scheduler (and other user-scoped push
    features) to deliver real-time messages to a user's open chat tab(s).
    """

    # ponytail: per-user connection cap prevents a single client from
    # exhausting memory via unbounded WS spawns. 16 covers typical
    # multi-tab usage. Upgrade path: make configurable via server_config.
    _MAX_CONNECTIONS_PER_USER = 16

    def __init__(self) -> None:
        # user_id -> list of active WebSocket connections (oldest first)
        self._connections: dict[str, list[WebSocket]] = {}
        # Strong references to in-flight eviction close() tasks. The event
        # loop only holds weak references to tasks, so an otherwise
        # unreferenced task may be garbage-collected before it finishes.
        self._pending_closes: set[asyncio.Task] = set()

    def add(self, user_id: str, ws: WebSocket) -> None:
        """Register *ws* for *user_id*.

        When the user is already at ``_MAX_CONNECTIONS_PER_USER`` the oldest
        connection is dropped from tracking and closed in the background
        (FIFO eviction) before *ws* is appended.
        """
        conns = self._connections.setdefault(user_id, [])
        if len(conns) >= self._MAX_CONNECTIONS_PER_USER:
            self._schedule_close(conns.pop(0))
        conns.append(ws)

    def _schedule_close(self, ws: WebSocket) -> None:
        """Start a best-effort background close of an evicted socket."""
        closer = self._close_quietly(ws)
        try:
            task = asyncio.create_task(closer)
        except RuntimeError:
            # No running event loop: nothing can await the close. Dispose of
            # the coroutine so it does not trigger a "never awaited" warning.
            closer.close()
            return
        self._pending_closes.add(task)
        task.add_done_callback(self._pending_closes.discard)

    @staticmethod
    async def _close_quietly(ws: WebSocket) -> None:
        """Close *ws* with code 1008, ignoring failures from a dead socket."""
        try:
            await ws.close(code=1008, reason="Connection limit exceeded")
        except (OSError, RuntimeError) as e:
            logger.debug("Portal WS eviction close failed (ignored): %s", e)

    def remove(self, user_id: str, ws: WebSocket) -> None:
        """Stop tracking *ws* for *user_id*; unknown users/sockets are a no-op.

        The user's entry is deleted once their last connection is removed.
        """
        conns = self._connections.get(user_id)
        if conns is None:
            return
        remaining = [w for w in conns if w is not ws]
        if remaining:
            self._connections[user_id] = remaining
        else:
            del self._connections[user_id]

    async def send_json(self, user_id: str, message: dict[str, object]) -> None:
        """Broadcast a JSON message to all connections for *user_id*.

        Removes stale connections that fail to send. A failure on one socket
        (including a ``WebSocketDisconnect``) does not stop delivery to the
        user's other sockets.
        """
        # Iterate over a snapshot: the awaits below may let add()/remove() run.
        conns = list(self._connections.get(user_id, []))
        if not conns:
            return
        stale: list[WebSocket] = []
        for ws in conns:
            try:
                await ws.send_json(message)
            except (
                ConnectionError,
                RuntimeError,
                asyncio.TimeoutError,
                WebSocketDisconnect,
            ) as e:
                logger.debug("Portal WS send failed for user %s (marking stale): %s", user_id, e)
                stale.append(ws)
        for ws in stale:
            self.remove(user_id, ws)


portal_connection_manager = PortalConnectionManager()
|
|
|
|
|
|
async def send_to_user(user_id: str, message: dict[str, object]) -> None:
    """Public helper to push a message to all portal WebSockets for a user.

    Delegates to the module-level ``portal_connection_manager.send_json``.

    Args:
        user_id: User whose tracked connections should receive the message.
        message: Payload forwarded unchanged to ``send_json``.
    """
    await portal_connection_manager.send_json(user_id, message)
|
|
|
|
|
|
# P1 #9 fix: ReAct event type -> TurnEventType mapping for EQ subscribers.
# Preserves the original EQ contract so CLI and other subscribers that
# filter on TurnEventType constants (e.g. 'turn.thinking') keep working.
# Keys are the ReAct event_type strings; values are TurnEventType constants.
_REACT_EVENT_TYPE_MAP: dict[str, str] = {
    "thinking": TurnEventType.THINKING,
    "tool_call": TurnEventType.TOOL_CALL,
    "tool_result": TurnEventType.TOOL_RESULT,
    "token": TurnEventType.TOKEN,
    "final_answer": TurnEventType.FINAL_ANSWER,
    "error": TurnEventType.TURN_COMPLETED,  # best-effort mapping
    # Confirmation requests are mapped onto the STEP constant.
    "confirmation_request": TurnEventType.STEP,
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# History injection helper — configurable limit + optional compression
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Maximum history messages to inject (can be overridden by server config)
_MAX_HISTORY_MESSAGES = 50


async def _build_history_messages(
    conv_id: str,
    limit: int = _MAX_HISTORY_MESSAGES,
) -> list[dict]:
    """Build conversation history messages for LLM context injection.

    Args:
        conv_id: Conversation whose history is fetched from the store.
        limit: Maximum number of messages requested from the store.

    Returns:
        A list of {"role": "user"|"assistant", "content": ...} dicts for the
        prior turns, excluding the current user message (the caller appends
        that separately). Messages with any other role are dropped. An empty
        list is returned when the store lookup fails.
    """
    try:
        history = await _conversation_store.get_history(conv_id, limit=limit)
    except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError):
        # History is optional context: degrade to "no history", but leave a
        # trace so storage problems are not silently hidden.
        logger.warning(
            "History lookup failed for %s; continuing without history", conv_id, exc_info=True
        )
        return []

    # The last message in history is the current user message (just added),
    # so skip it to avoid duplication.
    return [
        {"role": hist_msg.role, "content": hist_msg.content}
        for hist_msg in history[:-1]
        if hist_msg.role in ("user", "assistant")
    ]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Capability mapping
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Static catalogue of portal capability categories, keyed by category name.
# Each entry carries the user-facing display name, description and icon name
# that get_capabilities copies into its response. The display strings are
# user-facing Chinese text; English glosses are given in the comments.
CAPABILITY_CATEGORIES: dict[str, dict[str, str]] = {
    # Smart chat: natural-language interaction, auto-routed to a capability.
    "chat": {
        "display_name": "智能对话",
        "description": "自然语言交互,自动路由到对应能力",
        "icon": "MessageOutlined",
    },
    # Workflow orchestration: visual drag-and-drop workflow building.
    "workflow": {
        "display_name": "工作流编排",
        "description": "可视化拖拽编排工作流",
        "icon": "ApartmentOutlined",
    },
    # Knowledge base: document ingestion, semantic retrieval, multi-source RAG.
    "knowledge": {
        "display_name": "知识库",
        "description": "文档摄取、语义检索、多源RAG",
        "icon": "BookOutlined",
    },
    # Skill management: browse and manage registered skills.
    "skills": {
        "display_name": "技能管理",
        "description": "浏览和管理已注册的技能",
        "icon": "AppstoreOutlined",
    },
    # Smart terminal: interactive terminal sessions and command execution.
    "terminal": {
        "display_name": "智能终端",
        "description": "交互式终端会话和命令执行",
        "icon": "CodeOutlined",
    },
    # Computer Use: UI automation and screenshot recognition.
    "computer_use": {
        "display_name": "Computer Use",
        "description": "UI自动化操作和截屏识别",
        "icon": "DesktopOutlined",
    },
    # Self-evolution: experience accumulation, pitfall warnings, path optimisation.
    "evolution": {
        "display_name": "自进化",
        "description": "经验积累、避坑预警、路径优化",
        "icon": "RiseOutlined",
    },
    # System settings: configure LLM, skills and knowledge-base connections.
    "settings": {
        "display_name": "系统设置",
        "description": "配置LLM、技能、知识库连接",
        "icon": "SettingOutlined",
    },
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Request / Response models
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class ChatRequest(BaseModel):
    """Request body for the portal chat endpoints."""

    # The user's message text.
    message: str
    # Existing conversation to continue; passed to the store's get_or_create.
    conversation_id: str | None = None
    # NOTE(review): not read by the chat handlers visible in this file — confirm usage.
    sources: list[str] | None = None
    # When set, routing is forced by prefixing the message with "@skill:<name>".
    skill_name: str | None = None
|
|
|
|
|
|
class ChatResponse(BaseModel):
    """Response body of POST /portal/chat."""

    conversation_id: str
    # The assistant's reply text.
    message: str
    # ISO-8601 timestamp string.
    timestamp: str
    # Routing details reported by the request preprocessor.
    matched_skill: str | None = None
    routing_method: str | None = None
    confidence: float | None = None
    task_id: str | None = None
    status: str = "completed"
|
|
|
|
|
|
class CapabilityInfo(BaseModel):
    """One capability category as returned by GET /portal/capabilities."""

    # Category key from CAPABILITY_CATEGORIES.
    name: str
    display_name: str
    description: str
    # Icon identifier string for the frontend.
    icon: str
    enabled: bool
    # Number of registered skills counted toward this category.
    skill_count: int
|
|
|
|
|
|
class CapabilitiesResponse(BaseModel):
    """Response body of GET /portal/capabilities."""

    # One entry per category in CAPABILITY_CATEGORIES.
    capabilities: list[CapabilityInfo]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helper: resolve agent + skill for a chat request
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def _resolve_for_chat(
    request: ChatRequest, req: Request
) -> tuple[
    ConfigDrivenAgent | None, SkillRoutingResult | None, str | None, str | None, float | None
]:
    """Route a chat request through the RequestPreprocessor and pick its agent.

    Steps:
      1. Take default tools and system prompt from the "default" agent, or
         failing that from the first registered skill that already has an
         agent in the pool.
      2. Preprocess the message. An explicit ``request.skill_name`` is
         expressed by prefixing the message with ``@skill:<name>``.
      3. Pick the agent: the matched skill's agent (created on demand), else
         the "default" agent, else one created from the first registered skill.

    Returns:
        (agent, routing_result, matched_skill_name, routing_method, confidence).
        ``agent`` may be None when nothing suitable exists.
    """
    pool = req.app.state.agent_pool
    skill_registry = req.app.state.skill_registry
    request_preprocessor: RequestPreprocessor = req.app.state.request_preprocessor

    # --- 1. defaults handed to the preprocessor ---------------------------
    default_tools = []
    default_system_prompt = None
    template_agent = pool.get_agent("default")
    if template_agent is None:
        for skill in skill_registry.list_skills():
            candidate = pool.get_agent(skill.name)
            if candidate is not None:
                template_agent = candidate
                break
    if template_agent is not None:
        default_tools = template_agent.get_tools()
        default_system_prompt = (
            getattr(template_agent, "_system_prompt", None) or template_agent.get_system_prompt()
        )

    # --- 2. routing ---------------------------------------------------------
    # Preprocessing is minimal: @skill prefix + greeting regex + REACT.
    if request.skill_name:
        content = f"@skill:{request.skill_name} {request.message}"
    else:
        content = request.message
    routing_result = await request_preprocessor.preprocess(
        content=content,
        skill_registry=skill_registry,
        default_tools=default_tools,
        default_system_prompt=default_system_prompt,
        default_model="default",
        default_agent_name="default",
    )

    matched_skill_name = routing_result.skill_name or routing_result.agent_name
    routing_method = routing_result.match_method
    confidence = routing_result.match_confidence

    # --- 3. agent selection -------------------------------------------------
    if routing_result.matched and routing_result.skill_name:
        agent = pool.get_agent(routing_result.skill_name)
        if agent is None:
            agent = await pool.create_agent_from_skill(routing_result.skill_name)
    else:
        agent = pool.get_agent("default")
        if agent is None:
            # Fallback: try to create from first available skill
            registered = skill_registry.list_skills()
            if registered:
                agent = await pool.create_agent_from_skill(registered[0].name)

    return agent, routing_result, matched_skill_name, routing_method, confidence
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Endpoints
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@router.post("/portal/chat", response_model=ChatResponse)
async def chat(request: ChatRequest, req: Request, _auth: None = Depends(_verify_api_key)):
    """Send a chat message and get a response with RequestPreprocessor routing.

    DIRECT_CHAT requests are answered with a single LLM call. Every other
    execution mode runs the agent's ReAct loop (advanced modes fall back to
    REACT with a warning). Prior conversation turns are injected in both
    paths, and the user and assistant messages are persisted.

    Raises:
        HTTPException: 404 when ``skill_name`` is given but not registered;
            503 when the request needs an agent and none could be resolved.
    """
    # If skill_name is explicitly requested but not found, return 404
    if request.skill_name:
        skill_registry = req.app.state.skill_registry
        if not skill_registry.has_skill(request.skill_name):
            raise HTTPException(status_code=404, detail=f"Skill '{request.skill_name}' not found")

    agent, routing_result, matched_skill, routing_method, confidence = await _resolve_for_chat(
        request, req
    )

    direct_chat = (
        routing_result is not None and routing_result.execution_mode == ExecutionMode.DIRECT_CHAT
    )
    # The ReAct path dereferences the agent. Fail with a clear error before
    # anything is persisted instead of crashing with AttributeError later.
    if not direct_chat and agent is None:
        raise HTTPException(
            status_code=503, detail="No agent is available to handle this request"
        )

    # Create or reuse conversation
    conv = await _conversation_store.get_or_create(request.conversation_id)
    await _conversation_store.add_message(conv.id, "user", request.message)

    llm_gateway = req.app.state.llm_gateway

    task_id = str(uuid.uuid4())
    response_text = ""

    if direct_chat:
        # DIRECT_CHAT: direct LLM call, no ReAct loop (same as WebSocket path)
        # Message order: [system prompt] + history + current user message.
        chat_messages = []
        if routing_result.system_prompt:
            chat_messages.append({"role": "system", "content": routing_result.system_prompt})
        chat_messages.extend(await _build_history_messages(conv.id))
        chat_messages.append({"role": "user", "content": request.message})
        response = await llm_gateway.chat(
            messages=chat_messages,
            model=routing_result.model or "default",
            agent_name="default",
            task_type="chat",
        )
        response_text = _ensure_non_empty(response.content)
    else:
        # REACT / SKILL_REACT / REWOO / REFLEXION / PLAN_EXEC / TEAM_COLLAB
        # Advanced modes (REWOO, REFLEXION, PLAN_EXEC, TEAM_COLLAB) currently
        # fall back to REACT with a warning. Full integration is tracked separately.
        if routing_result is not None and routing_result.execution_mode not in (
            ExecutionMode.REACT,
            ExecutionMode.SKILL_REACT,
        ):
            logger.warning(
                f"Execution mode {routing_result.execution_mode.value} not yet supported "
                f"in portal REST, falling back to REACT"
            )

        react_config = agent.get_react_config()
        # Reuse the agent's engine when it has one (after a reset); otherwise
        # build a fresh engine bounded by the agent's max_steps.
        react_engine = getattr(agent, "_react_engine", None)
        if react_engine is None:
            react_engine = ReActEngine(
                llm_gateway=llm_gateway,
                max_steps=react_config["max_steps"],
            )
        else:
            react_engine.reset()

        # History first, current user message last.
        messages = await _build_history_messages(conv.id)
        messages.append({"role": "user", "content": request.message})
        tools = agent.get_tools()
        model = agent.get_model()
        system_prompt = getattr(agent, "_system_prompt", None) or agent.get_system_prompt()
        timeout_seconds = react_config["timeout_seconds"]

        collected_output: list[str] = []
        try:
            async for event in react_engine.execute_stream(
                messages=messages,
                tools=tools,
                model=model,
                agent_name=agent.name,
                system_prompt=system_prompt,
                timeout_seconds=timeout_seconds,
            ):
                if event.event_type == "final_answer":
                    collected_output.append(event.data.get("output", ""))
        except asyncio.CancelledError:
            raise
        except Exception as e:
            # The error is reported to the client as the reply text; log the
            # traceback so the failure is diagnosable server-side.
            logger.exception("Portal chat ReAct execution failed (conversation=%s)", conv.id)
            response_text = f"执行出错: {e}"
        else:
            response_text = _ensure_non_empty(
                "".join(collected_output) if collected_output else None
            )

    await _conversation_store.add_message(conv.id, "assistant", response_text)

    return ChatResponse(
        conversation_id=conv.id,
        message=response_text,
        timestamp=datetime.now(timezone.utc).isoformat(),
        matched_skill=matched_skill,
        routing_method=routing_method,
        confidence=confidence,
        task_id=task_id,
        status="completed",
    )
|
|
|
|
|
|
@router.post("/portal/chat/stream")
async def chat_stream(request: ChatRequest, req: Request, _auth: None = Depends(_verify_api_key)):
    """Stream chat responses via SSE with RequestPreprocessor routing.

    The stream starts with a ``routing`` event. DIRECT_CHAT requests then
    produce one ``final_answer`` event from a single LLM call. Other modes
    forward every ReAct event as it happens (advanced modes fall back to
    REACT with a warning). Prior conversation turns are injected in both
    paths. A ReAct failure is reported as an ``error`` event and ends the
    stream without persisting an assistant message.

    Raises:
        HTTPException: 503 when the request needs an agent and none could
            be resolved.
    """
    from sse_starlette.sse import EventSourceResponse

    agent, routing_result, matched_skill, routing_method, confidence = await _resolve_for_chat(
        request, req
    )

    direct_chat = (
        routing_result is not None and routing_result.execution_mode == ExecutionMode.DIRECT_CHAT
    )
    # The ReAct path dereferences the agent. Fail with a clear error before
    # anything is persisted or streamed instead of crashing mid-stream.
    if not direct_chat and agent is None:
        raise HTTPException(
            status_code=503, detail="No agent is available to handle this request"
        )

    # Create or reuse conversation
    conv = await _conversation_store.get_or_create(request.conversation_id)
    await _conversation_store.add_message(conv.id, "user", request.message)

    llm_gateway = req.app.state.llm_gateway

    async def event_generator():
        # Send routing info as first event
        yield {
            "event": "routing",
            "data": json.dumps(
                {
                    "skill": matched_skill,
                    "method": routing_method,
                    "confidence": confidence,
                }
            ),
        }

        if direct_chat:
            # DIRECT_CHAT: direct LLM call, no ReAct loop
            # Message order: [system prompt] + history + current user message.
            chat_messages = []
            if routing_result.system_prompt:
                chat_messages.append({"role": "system", "content": routing_result.system_prompt})
            chat_messages.extend(await _build_history_messages(conv.id))
            chat_messages.append({"role": "user", "content": request.message})
            response = await llm_gateway.chat(
                messages=chat_messages,
                model=routing_result.model or "default",
                agent_name="default",
                task_type="chat",
            )
            response_text = _ensure_non_empty(response.content)
            await _conversation_store.add_message(conv.id, "assistant", response_text)
            yield {
                "event": "final_answer",
                "data": json.dumps(
                    {
                        "step": 0,
                        "data": {"output": response_text},
                        "timestamp": datetime.now(timezone.utc).isoformat(),
                    }
                ),
            }
        else:
            # REACT / SKILL_REACT / REWOO / REFLEXION / PLAN_EXEC / TEAM_COLLAB
            # Advanced modes fall back to REACT with a warning.
            if routing_result is not None and routing_result.execution_mode not in (
                ExecutionMode.REACT,
                ExecutionMode.SKILL_REACT,
            ):
                logger.warning(
                    f"Execution mode {routing_result.execution_mode.value} not yet supported "
                    f"in portal SSE, falling back to REACT"
                )

            react_config = agent.get_react_config()
            # Reuse the agent's engine when it has one (after a reset);
            # otherwise build a fresh engine bounded by the agent's max_steps.
            react_engine = getattr(agent, "_react_engine", None)
            if react_engine is None:
                react_engine = ReActEngine(
                    llm_gateway=llm_gateway,
                    max_steps=react_config["max_steps"],
                )
            else:
                react_engine.reset()

            # Inject conversation history ahead of the current user message,
            # matching the REST endpoint and the DIRECT_CHAT branch above.
            messages = await _build_history_messages(conv.id)
            messages.append({"role": "user", "content": request.message})
            tools = agent.get_tools()
            model = agent.get_model()
            system_prompt = getattr(agent, "_system_prompt", None) or agent.get_system_prompt()
            timeout_seconds = react_config["timeout_seconds"]

            collected_output: list[str] = []
            try:
                async for event in react_engine.execute_stream(
                    messages=messages,
                    tools=tools,
                    model=model,
                    agent_name=agent.name,
                    system_prompt=system_prompt,
                    timeout_seconds=timeout_seconds,
                ):
                    if event.event_type == "final_answer":
                        collected_output.append(event.data.get("output", ""))
                    yield {
                        "event": event.event_type,
                        "data": json.dumps(
                            {
                                "step": event.step,
                                "data": event.data,
                                "timestamp": event.timestamp,
                            }
                        ),
                    }
            except asyncio.CancelledError:
                raise
            except Exception as e:
                # The client only receives the message text; keep the
                # traceback in the server log.
                logger.exception(
                    "Portal SSE ReAct execution failed (conversation=%s)", conv.id
                )
                yield {
                    "event": "error",
                    "data": json.dumps({"error": str(e)}),
                }
                return

            response_text = _ensure_non_empty(
                "".join(collected_output) if collected_output else None
            )
            await _conversation_store.add_message(conv.id, "assistant", response_text)

    return EventSourceResponse(event_generator())
|
|
|
|
|
|
@router.get("/portal/capabilities", response_model=CapabilitiesResponse)
async def get_capabilities(req: Request, _auth: None = Depends(_verify_api_key)):
    """List all available capabilities with their status.

    Every category in ``CAPABILITY_CATEGORIES`` is reported as enabled. Its
    ``skill_count`` is the number of capability tags equal to the category
    name across all registered skills; the "skills" category additionally
    counts each registered skill once.
    """
    tallies: dict[str, int] = {}
    for skill in req.app.state.skill_registry.list_skills():
        for cap in skill.capabilities:
            tallies[cap.tag] = tallies.get(cap.tag, 0) + 1
        # Each skill also counts once toward the "skills" category.
        tallies["skills"] = tallies.get("skills", 0) + 1

    return CapabilitiesResponse(
        capabilities=[
            CapabilityInfo(
                name=category,
                display_name=info["display_name"],
                description=info["description"],
                icon=info["icon"],
                enabled=True,
                skill_count=tallies.get(category, 0),
            )
            for category, info in CAPABILITY_CATEGORIES.items()
        ]
    )
|
|
|
|
|
|
@router.get("/portal/conversations")
async def list_conversations(limit: int = 20, _auth: None = Depends(_verify_api_key)):
    """List recent conversations.

    Each entry's title is derived from the conversation's first user message
    as returned by the store's ``get_first_user_message``, not from the
    conversation object's ``messages`` list (which may be empty after a
    restart). This keeps titles from collapsing to the placeholder "对话".

    Each entry also carries ``is_board`` (whether a ``board_started``
    message exists) so the sidebar can show its board badge without loading
    the full history of every conversation.
    """
    summaries: list[dict] = []
    for conv in await _conversation_store.list_conversations(limit=limit):
        first_user = await _conversation_store.get_first_user_message(conv.id)
        first_content = None if not first_user else first_user.content
        board_flag = await _conversation_has_board_started(conv.id)
        summaries.append(
            {
                "id": conv.id,
                "title": _derive_conversation_title_from_content(first_content),
                "created_at": conv.created_at.isoformat(),
                "updated_at": conv.updated_at.isoformat(),
                "message_count": len(conv.messages),
                "is_board": board_flag,
            }
        )
    return summaries
|
|
|
|
|
|
async def _conversation_has_board_started(conversation_id: str) -> bool:
    """Report whether the conversation has a persisted ``board_started`` message.

    Asks the conversation store via ``has_message_with_type``, so the sidebar
    can show its board badge without loading the full message history.

    A storage error is logged at warning level and reported as False, so the
    badge lookup can never fail the list endpoint.
    """
    try:
        found = await _conversation_store.has_message_with_type(
            conversation_id, "board_started"
        )
    except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError):
        logger.warning("is_board lookup failed for %s", conversation_id, exc_info=True)
        return False
    return found
|
|
|
|
|
|
def _derive_conversation_title(conv: Conversation) -> str:
    """Derive a human-readable title from the first user message in *conv*.

    Delegates to ``_derive_conversation_title_from_content`` so that titles
    built from a conversation object follow the same rules as titles built
    from stored content: a leading @board/@team command is stripped, the
    result is truncated to 20 characters with "..." appended, and "对话" is
    the fallback when there is no usable user message.
    """
    first_user_content = next(
        (msg.content for msg in conv.messages if msg.role == "user" and msg.content),
        None,
    )
    return _derive_conversation_title_from_content(first_user_content)
|
|
|
|
|
|
# Leading "@board" / "@team" command, with an optional ":<targets>" part and
# an optional "rounds=<n>" argument, plus trailing whitespace.
# The negative lookahead requires the keyword to end there, so ordinary words
# that merely start with it ("@boardroom", "@teammate") are left untouched.
_COMMAND_PREFIX_RE = re.compile(
    r"^@(?:board|team)(?![A-Za-z0-9_-])(?::[^\s]+)?(?:\s+rounds=\d+)?\s*",
    re.IGNORECASE,
)


def _strip_command_prefix(content: str) -> str:
    """Strip a leading @board/@team command so conversation titles show only the topic.

    Matching is case-insensitive and only applies at the very start of
    *content*. The result is stripped of surrounding whitespace; empty input
    yields "".

    Examples:
        "@board:warren,charlie 怎么看 AI" → "怎么看 AI"
        "@team 私董会" → "私董会"
        "@board rounds=3 软件行业" → "软件行业"
        "@boardroom meeting" → "@boardroom meeting"
    """
    if not content:
        return ""
    return _COMMAND_PREFIX_RE.sub("", content, count=1).strip()
|
|
|
|
|
|
def _derive_conversation_title_from_content(content: str | None) -> str:
|
|
"""Derive title from a string content (used when conv.messages is empty)."""
|
|
if content:
|
|
cleaned = _strip_command_prefix(content)
|
|
if cleaned:
|
|
return cleaned[:20] + ("..." if len(cleaned) > 20 else "")
|
|
return "对话"
|
|
|
|
|
|
@router.get("/portal/conversations/{conversation_id}")
async def get_conversation(
    conversation_id: str, limit: int = 50, _auth: None = Depends(_verify_api_key)
):
    """Get conversation history from SQLite-backed store.

    The title and ``is_board`` flag are resolved the same way as in the
    conversation list: the title comes from the store's first user message
    and ``is_board`` from the presence of a ``board_started`` message. Both
    fall back to / are short-circuited by the fetched ``history`` window, but
    neither depends on that window alone, so a conversation longer than
    *limit* keeps the same title and badge it has in the list.

    Raises:
        HTTPException: 404 when the conversation has no stored messages.
    """
    history = await _conversation_store.get_history(conversation_id, limit=limit)
    if not history:
        raise HTTPException(status_code=404, detail=f"Conversation '{conversation_id}' not found")
    conv = await _conversation_store.get_or_create(conversation_id)

    # Prefer the store's first user message; the history window is bounded by
    # `limit` and may not contain it.
    first_user = await _conversation_store.get_first_user_message(conversation_id)
    first_user_content = first_user.content if first_user else None
    if not first_user_content:
        first_user_content = next(
            (m.content for m in history if m.role == "user" and m.content),
            None,
        )

    # Check the already-loaded window first; only query the store when the
    # board_started message is not in it.
    is_board = any(
        (m.metadata or {}).get("message_type") == "board_started"
        for m in history
    )
    if not is_board:
        is_board = await _conversation_has_board_started(conversation_id)

    return {
        "id": conv.id,
        "title": _derive_conversation_title_from_content(first_user_content),
        "messages": [_hydrate_persisted_message(conv.id, i, m) for i, m in enumerate(history)],
        "created_at": conv.created_at.isoformat(),
        "updated_at": conv.updated_at.isoformat(),
        "is_board": is_board,
    }
|
|
|
|
|
|
# Fields we store inside Message.metadata to reconstruct board_* messages
# after a page reload. The Message dataclass only has role/content/timestamp
# as first-class columns; everything else (message_type, expert identity,
# board round/role, conclusion payload) rides along in metadata.
_PERSISTED_MESSAGE_FIELDS = (
    "message_type",
    "expert_id",
    "expert_name",
    "expert_avatar",
    "expert_color",
    "board_round",
    "board_role",
    "board_conclusion",
    "board_started",
    "matched_skill",
    "confidence",
    "routing_method",
    "thinking",
    "tool_calls",
)


def _hydrate_persisted_message(conv_id: str, index: int, msg) -> dict:
    """Convert one stored message into the dict returned by the API.

    The result always has ``id`` ("<conv_id>-<index>"), ``role``, ``content``,
    ``timestamp`` (ISO string) and ``metadata`` (``{}`` when the message has
    none). When the metadata is a dict, every key listed in
    ``_PERSISTED_MESSAGE_FIELDS`` that has a non-None value is also copied to
    the top level, so the frontend can rebuild board_speech / board_summary /
    board_conclusion cards after a reload instead of plain chat bubbles.
    """
    meta = getattr(msg, "metadata", None) or {}
    payload: dict = {
        "id": f"{conv_id}-{index}",
        "role": msg.role,
        "content": msg.content,
        "timestamp": msg.timestamp.isoformat(),
        "metadata": meta,
    }
    if isinstance(meta, dict):
        # Promote the well-known rendering fields; None values are skipped.
        payload.update(
            {
                key: meta[key]
                for key in _PERSISTED_MESSAGE_FIELDS
                if key in meta and meta[key] is not None
            }
        )
    return payload
|
|
|
|
|
|
@router.delete("/portal/conversations/{conversation_id}")
async def delete_conversation(conversation_id: str, _auth: None = Depends(_verify_api_key)):
    """Remove a conversation together with all of its messages.

    Access-control note carried over from the original author (IDOR):
    portal endpoints authenticate with an API key under a single-tenant
    model, i.e. the key grants access to every conversation. The SQLite
    store has no ``user_id`` column, so per-user ownership cannot be
    enforced without a schema migration. If API keys become per-user, add a
    ``user_id`` column to conversations and filter the DELETE by
    ``(id, user_id)``. Suggested upgrade path: move portal endpoints to JWT
    auth with per-user scoping.

    Args:
        conversation_id: Identifier of the conversation to delete.
        _auth: API-key dependency; its value is not used.

    Returns:
        ``{"deleted": True, "id": conversation_id}`` on success.

    Raises:
        HTTPException: 404 when the store reports nothing was deleted.
    """
    was_removed = await _conversation_store.delete_conversation(conversation_id)
    if was_removed:
        return {"deleted": True, "id": conversation_id}
    raise HTTPException(status_code=404, detail=f"Conversation '{conversation_id}' not found")
|
|
|
|
|
|
def _derive_title_from_messages(messages: list) -> str:
    """Derive a conversation title from a list of message objects.

    The title is the first 20 characters of the first user message that has
    non-empty content, followed by ``"..."`` when the content is longer than
    20 characters.

    Args:
        messages: Message objects exposing ``role`` and ``content``. ``role``
            may be an enum member whose ``value`` is the role name
            (SessionManager format) or a plain string.

    Returns:
        The derived title, or the placeholder ``"对话"`` when no user
        message with content exists.
    """
    for msg in messages:
        # Accept both enum-style roles (.value) and plain-string roles; the
        # bare ``msg.role.value`` access raised AttributeError for strings.
        role_name = getattr(msg.role, "value", msg.role)
        if role_name == "user" and msg.content:
            return msg.content[:20] + ("..." if len(msg.content) > 20 else "")
    return "对话"
|
|
|
|
|
|
async def _bg_best_effort(make_awaitable, failure_message: str, *, shielded: bool = False) -> None:
    """Await one side-channel operation, logging instead of raising on failure.

    Args:
        make_awaitable: Zero-argument callable returning the awaitable to run.
            It is invoked inside the ``try`` so an error raised while building
            the awaitable is handled like one raised while awaiting it.
        failure_message: Warning logged (with traceback) when the step fails.
        shielded: When true, the awaitable is wrapped in ``asyncio.shield`` and
            a ``CancelledError`` raised while waiting is logged and swallowed
            so the caller can finish its cleanup. When false, cancellation
            propagates to the caller.
    """
    try:
        pending = make_awaitable()
        await (asyncio.shield(pending) if shielded else pending)
    except asyncio.CancelledError:
        if not shielded:
            raise
        logger.warning(failure_message, exc_info=True)
    except Exception:
        # Deliberately broad: store backends raise their own exception types
        # (e.g. sqlite3.Error), and a failed best-effort step must never
        # prevent the terminal task event from being emitted.
        logger.warning(failure_message, exc_info=True)


async def _execute_react_background(
    react_engine: ReActEngine,
    messages: list[dict],
    tools: list,
    model: str,
    agent_name: str,
    system_prompt: str | None,
    timeout_seconds: float | None,
    conv_id: str,
    task_id: str,
    event_queue: EventQueue,
    conversation_store: SqliteConversationStore,
    task_store: InMemoryTaskStore | None = None,
) -> None:
    """Run the ReAct engine in the background, decoupled from the WebSocket.

    Every engine event whose type is present in ``_REACT_EVENT_TYPE_MAP`` is
    re-emitted to ``event_queue`` tagged with ``task_id`` and ``conv_id``, so
    any subscriber (including a reconnected WebSocket) can consume it. The
    text of ``final_answer`` events is collected and stored as an assistant
    message in ``conversation_store`` whether or not anyone is subscribed.
    When ``task_store`` is given, the task status is moved through
    RUNNING -> COMPLETED / FAILED on a best-effort basis.

    Outcomes:
        * Normal completion: the answer is persisted, the status becomes
          COMPLETED and ``TASK_COMPLETED`` is emitted.
        * Cancellation: partial output (if any, and not already stored) is
          persisted, the status becomes FAILED ("Task cancelled") and
          ``TASK_FAILED`` is emitted. Each step runs under ``asyncio.shield``
          so a re-entrant cancel cannot abort the cleanup. The
          ``CancelledError`` is then re-raised.
        * Any other exception: it is logged, partial output is persisted,
          the status becomes FAILED with ``str(exc)`` and ``TASK_FAILED`` is
          emitted. The exception is not re-raised.

    Args:
        react_engine: Engine whose ``execute_stream`` yields the events.
        messages: Chat messages passed to the engine.
        tools: Tools passed to the engine.
        model: Model name passed to the engine.
        agent_name: Agent name passed to the engine.
        system_prompt: Optional system prompt passed to the engine.
        timeout_seconds: Optional timeout passed to the engine.
        conv_id: Conversation the result is persisted to; also the event
            ``session_id``.
        task_id: Identifier attached to every emitted event and status update.
        event_queue: Queue handed to ``_emit_event_safe``.
        conversation_store: Store receiving the assistant message.
        task_store: Optional store for status tracking.

    Raises:
        asyncio.CancelledError: Re-raised after cancellation cleanup.
    """
    collected_output: list[str] = []
    # Set once the final answer is stored so the failure/cancel handlers do
    # not write the same assistant message a second time.
    result_persisted = False

    async def _set_task_status(status, failure_message, *, shielded=False, **fields) -> None:
        """Best-effort TaskStore status update; no-op without a task store."""
        if task_store is None:
            return
        await _bg_best_effort(
            lambda: _task_store_update_status(task_store, task_id, status, **fields),
            failure_message,
            shielded=shielded,
        )

    async def _persist_partial_output(failure_message, *, shielded=False) -> None:
        """Best-effort persistence of whatever output was collected so far."""
        if not collected_output or result_persisted:
            return
        partial = _ensure_non_empty("".join(collected_output))
        await _bg_best_effort(
            lambda: conversation_store.add_message(conv_id, "assistant", partial),
            failure_message,
            shielded=shielded,
        )

    try:
        await _set_task_status(
            TaskStatus.RUNNING,
            "Failed to update TaskStore RUNNING",
            started_at=datetime.now(timezone.utc),
        )

        async for event in react_engine.execute_stream(
            messages=messages,
            tools=tools,
            model=model,
            agent_name=agent_name,
            system_prompt=system_prompt,
            timeout_seconds=timeout_seconds,
        ):
            if event.event_type == "final_answer":
                # Coerce to str (None -> "") so the later "".join cannot raise
                # and turn a finished run into a failure.
                output = event.data.get("output")
                collected_output.append("" if output is None else str(output))

            # Forward mapped events with the original data plus step and
            # timestamp. The event type may not define ``step``, hence getattr.
            turn_event_type = _REACT_EVENT_TYPE_MAP.get(event.event_type)
            if turn_event_type is not None:
                await _emit_event_safe(
                    event_queue,
                    turn_event_type,
                    task_id=task_id,
                    session_id=conv_id,
                    data={
                        **event.data,
                        "step": getattr(event, "step", 0),
                        "timestamp": event.timestamp,
                    },
                )

        # Normal completion: persist the result.
        response_text = _ensure_non_empty("".join(collected_output) if collected_output else None)
        await conversation_store.add_message(conv_id, "assistant", response_text)
        result_persisted = True

        await _set_task_status(
            TaskStatus.COMPLETED,
            "Failed to update TaskStore COMPLETED",
            output_data={"output": response_text},
            completed_at=datetime.now(timezone.utc),
            progress=1.0,
            progress_message="Completed",
        )

        # Tell subscribers the task is done.
        await _emit_event_safe(
            event_queue,
            TaskEventType.TASK_COMPLETED,
            task_id=task_id,
            session_id=conv_id,
            data={"output": response_text, "timestamp": datetime.now(timezone.utc).isoformat()},
        )

    except asyncio.CancelledError:
        # Application shutdown or explicit cancel: persist partial output and
        # mark the task FAILED so a later resume does not wait forever. Every
        # step is shielded so a re-entrant cancellation cannot abort cleanup.
        await _persist_partial_output("Failed to persist partial output on cancel", shielded=True)
        await _set_task_status(
            TaskStatus.FAILED,
            "Failed to update TaskStore on cancel",
            shielded=True,
            error_message="Task cancelled",
            completed_at=datetime.now(timezone.utc),
        )
        await _bg_best_effort(
            lambda: _emit_event_safe(
                event_queue,
                TaskEventType.TASK_FAILED,
                task_id=task_id,
                session_id=conv_id,
                data={
                    "error": "Task cancelled",
                    "timestamp": datetime.now(timezone.utc).isoformat(),
                },
            ),
            "Failed to emit TASK_FAILED on cancel",
            shielded=True,
        )
        raise  # Propagate cancellation

    except Exception as e:
        logger.exception("ReAct background task %s failed", task_id)
        # Persist any partial output collected before the error.
        await _persist_partial_output("Failed to persist partial output in background task")
        await _set_task_status(
            TaskStatus.FAILED,
            "Failed to update TaskStore FAILED",
            error_message=str(e),
            completed_at=datetime.now(timezone.utc),
        )
        # Tell subscribers the task failed. The best-effort steps above
        # cannot raise ordinary exceptions, so this is always reached.
        await _emit_event_safe(
            event_queue,
            TaskEventType.TASK_FAILED,
            task_id=task_id,
            session_id=conv_id,
            data={"error": str(e), "timestamp": datetime.now(timezone.utc).isoformat()},
        )
|
|
|
|
|
|
@router.websocket("/portal/ws")
|
|
async def portal_websocket(websocket: WebSocket):
|
|
"""Real-time chat WebSocket endpoint."""
|
|
await websocket.accept()
|
|
|
|
# ponytail: ws_user_id must be initialized before any early return — the
|
|
# finally block below references it. Previously the api_key reject path
|
|
# returned before assignment, causing UnboundLocalError that masked the
|
|
# original auth error. Upgrade path: refactor auth into a decorator.
|
|
ws_user_id: str | None = None
|
|
|
|
# Authentication (after accept, since FastAPI requires accept before close)
|
|
configured_api_key: str | None = None
|
|
if hasattr(websocket.app.state, "server_config") and websocket.app.state.server_config:
|
|
configured_api_key = websocket.app.state.server_config.api_key
|
|
if configured_api_key is None and hasattr(websocket.app.state, "api_key"):
|
|
configured_api_key = websocket.app.state.api_key
|
|
|
|
# Check api_key query param
|
|
if configured_api_key:
|
|
provided = websocket.query_params.get("api_key")
|
|
if not hmac.compare_digest((provided or "").encode(), configured_api_key.encode()):
|
|
await websocket.send_json(
|
|
{"type": "error", "data": {"message": "Invalid or missing api_key"}}
|
|
)
|
|
await websocket.close(code=4001, reason="Invalid or missing api_key")
|
|
return
|
|
|
|
# Track authenticated portal connections for user-scoped push (calendar
|
|
# reminders, etc.). user_id is None for API-key / dev-mode clients.
|
|
current_user = getattr(websocket.state, "current_user", None) or {}
|
|
ws_user_id = current_user.get("user_id")
|
|
if ws_user_id:
|
|
portal_connection_manager.add(ws_user_id, websocket)
|
|
|
|
# Wait for first chat message before creating conversation
|
|
conv: Conversation | None = None
|
|
# task_id is per-user-message; tracked here so the outer except can emit task.failed
|
|
task_id: str | None = None
|
|
# Track the active background task so cancel can propagate to it.
|
|
active_bg_task: asyncio.Task | None = None
|
|
|
|
try:
|
|
while True:
|
|
try:
|
|
timeout = _WS_HEARTBEAT_TIMEOUT if _WS_HEARTBEAT_TIMEOUT > 0 else None
|
|
raw = await asyncio.wait_for(websocket.receive_text(), timeout=timeout)
|
|
except asyncio.TimeoutError:
|
|
await websocket.close(code=1000, reason="Heartbeat timeout")
|
|
return
|
|
|
|
try:
|
|
msg = json.loads(raw)
|
|
except json.JSONDecodeError:
|
|
continue
|
|
|
|
msg_type = msg.get("type")
|
|
|
|
if msg_type == "cancel":
|
|
# Cancel the active background task if still running
|
|
if active_bg_task is not None and not active_bg_task.done():
|
|
active_bg_task.cancel()
|
|
active_bg_task = None
|
|
await websocket.send_json(
|
|
{
|
|
"type": "result",
|
|
"data": {
|
|
"status": "cancelled",
|
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
},
|
|
}
|
|
)
|
|
return
|
|
|
|
if msg_type == "ping":
|
|
await websocket.send_json({"type": "pong"})
|
|
continue
|
|
|
|
if msg_type == "resume":
|
|
# Frontend reconnected and wants to resume a running task
|
|
resume_task_id = msg.get("task_id", "")
|
|
if not resume_task_id:
|
|
continue
|
|
|
|
# P1 #3/#4 fix: Fail-closed ownership verification.
|
|
# Require conversation_id and TaskStore — reject if either
|
|
# is missing, to prevent cross-conversation task hijacking
|
|
# via empty conversation_id or unconfigured TaskStore.
|
|
resume_conv_id = msg.get("conversation_id", "")
|
|
if not resume_conv_id:
|
|
await websocket.send_json(
|
|
{
|
|
"type": "error",
|
|
"data": {
|
|
"message": "Resume requires conversation_id.",
|
|
"task_id": resume_task_id,
|
|
},
|
|
}
|
|
)
|
|
continue
|
|
|
|
resume_task_store: InMemoryTaskStore | None = getattr(
|
|
websocket.app.state, "task_store", None
|
|
)
|
|
resume_eq: EventQueue | None = getattr(websocket.app.state, "event_queue", None)
|
|
|
|
# P1 #4: Fail-closed if TaskStore is unavailable — cannot
|
|
# verify ownership without it.
|
|
if resume_task_store is None:
|
|
await websocket.send_json(
|
|
{
|
|
"type": "error",
|
|
"data": {
|
|
"message": "Resume not supported (TaskStore unavailable). Please retry your request.",
|
|
"task_id": resume_task_id,
|
|
},
|
|
}
|
|
)
|
|
continue
|
|
|
|
try:
|
|
record = await _task_store_get(resume_task_store, resume_task_id)
|
|
except (
|
|
ConnectionError,
|
|
OSError,
|
|
asyncio.TimeoutError,
|
|
ValueError,
|
|
KeyError,
|
|
RuntimeError,
|
|
):
|
|
logger.warning("TaskStore.get failed during resume", exc_info=True)
|
|
record = None
|
|
if record is not None:
|
|
# P1 #3: Fail-closed ownership check — reject if
|
|
# conversation_id is missing from task metadata OR
|
|
# does not match the request.
|
|
task_conv_id = (record.metadata or {}).get("conversation_id", "")
|
|
if not task_conv_id or resume_conv_id != task_conv_id:
|
|
logger.warning(
|
|
"Resume rejected: conversation_id mismatch "
|
|
"(task=%s, request=%s, task_id=%s)",
|
|
task_conv_id,
|
|
resume_conv_id,
|
|
resume_task_id,
|
|
)
|
|
await websocket.send_json(
|
|
{
|
|
"type": "error",
|
|
"data": {
|
|
"message": "Task does not belong to this conversation.",
|
|
"task_id": resume_task_id,
|
|
},
|
|
}
|
|
)
|
|
continue
|
|
if record.status == TaskStatus.COMPLETED:
|
|
# Task already finished — send result immediately
|
|
output = (record.output_data or {}).get("output", "")
|
|
await websocket.send_json(
|
|
{
|
|
"type": "result",
|
|
"data": {
|
|
"message": output,
|
|
"timestamp": record.completed_at.isoformat()
|
|
if record.completed_at
|
|
else datetime.now(timezone.utc).isoformat(),
|
|
},
|
|
}
|
|
)
|
|
continue
|
|
elif record.status == TaskStatus.FAILED:
|
|
await websocket.send_json(
|
|
{
|
|
"type": "error",
|
|
"data": {
|
|
"message": record.error_message or "Task failed",
|
|
},
|
|
}
|
|
)
|
|
continue
|
|
else:
|
|
# Task not found in store — cannot resume
|
|
await websocket.send_json(
|
|
{
|
|
"type": "error",
|
|
"data": {
|
|
"message": "Task not found or has expired. Please retry your request.",
|
|
"task_id": resume_task_id,
|
|
},
|
|
}
|
|
)
|
|
continue
|
|
|
|
# Task is still running — subscribe to EventQueue for remaining events.
|
|
# H6: if EventQueue is unavailable, inform the client instead of
|
|
# silently continuing (which would leave the UI loading forever).
|
|
if resume_eq is None:
|
|
await websocket.send_json(
|
|
{
|
|
"type": "error",
|
|
"data": {
|
|
"message": "Resume not supported (EventQueue unavailable). Please retry your request.",
|
|
},
|
|
}
|
|
)
|
|
continue
|
|
|
|
# C2: bound the subscribe loop with a timeout so a dead
|
|
# background task cannot block resume forever.
|
|
resume_timeout = _WS_HEARTBEAT_TIMEOUT * 10 if _WS_HEARTBEAT_TIMEOUT > 0 else 600
|
|
try:
|
|
async with asyncio.timeout(resume_timeout):
|
|
async for event in resume_eq.subscribe(task_id=resume_task_id):
|
|
if event.event_type == TaskEventType.TASK_COMPLETED:
|
|
response_text = event.data.get("output", EMPTY_LLM_RESPONSE)
|
|
await websocket.send_json(
|
|
{
|
|
"type": "result",
|
|
"data": {
|
|
"message": response_text,
|
|
"timestamp": event.data.get(
|
|
"timestamp",
|
|
datetime.now(timezone.utc).isoformat(),
|
|
),
|
|
},
|
|
}
|
|
)
|
|
break
|
|
elif event.event_type == TaskEventType.TASK_FAILED:
|
|
await websocket.send_json(
|
|
{
|
|
"type": "error",
|
|
"data": {
|
|
"message": event.data.get("error", "Unknown error"),
|
|
},
|
|
}
|
|
)
|
|
break
|
|
else:
|
|
# P1 #8/#10 fix: step and data are now
|
|
# top-level fields in event.data.
|
|
await websocket.send_json(
|
|
{
|
|
"type": "step",
|
|
"data": {
|
|
"event_type": event.event_type,
|
|
"step": event.data.get("step", 0),
|
|
"data": {
|
|
k: v
|
|
for k, v in event.data.items()
|
|
if k not in ("step", "timestamp")
|
|
},
|
|
"timestamp": event.data.get("timestamp", ""),
|
|
},
|
|
}
|
|
)
|
|
except TimeoutError:
|
|
logger.warning(f"Resume subscribe timed out for task {resume_task_id}")
|
|
await websocket.send_json(
|
|
{
|
|
"type": "error",
|
|
"data": {
|
|
"message": "Task resume timed out. Please retry your request.",
|
|
"task_id": resume_task_id,
|
|
},
|
|
}
|
|
)
|
|
except RuntimeError as exc:
|
|
# P1 #5: subscriber limit reached or EQ closed — send
|
|
# a friendly error instead of terminating the connection.
|
|
logger.warning("Resume subscribe failed for task %s: %s", resume_task_id, exc)
|
|
await websocket.send_json(
|
|
{
|
|
"type": "error",
|
|
"data": {
|
|
"message": "Server busy, please retry shortly.",
|
|
"task_id": resume_task_id,
|
|
},
|
|
}
|
|
)
|
|
continue
|
|
|
|
if msg_type != "chat":
|
|
continue
|
|
|
|
message_text = msg.get("message", "")
|
|
model_override = msg.get("model") # Frontend model selector
|
|
|
|
if not message_text:
|
|
continue
|
|
|
|
# Create or switch conversation based on conversation_id from frontend
|
|
conv_id = msg.get("conversation_id")
|
|
if conv_id:
|
|
if conv is None or conv.id != conv_id:
|
|
conv = await _conversation_store.get_or_create(conv_id)
|
|
await websocket.send_json({"type": "connected", "conversation_id": conv.id})
|
|
elif conv is None:
|
|
conv = await _conversation_store.get_or_create(conv_id)
|
|
await websocket.send_json({"type": "connected", "conversation_id": conv.id})
|
|
|
|
# @board / @team intercept — mirror chat.py:1076-1081.
|
|
# Frontend WS connects to /api/v1/portal/ws (this router), not
|
|
# /api/v1/chat/ws/{session_id} (chat.py). Without this intercept
|
|
# @board/@team messages are treated as plain text and no
|
|
# board_started/team_formed events are broadcast, so the cards
|
|
# never render. Placed before task_id emit so intercepted
|
|
# messages don't leave orphan tasks in the EQ side-channel.
|
|
if await _execute_board_meeting(websocket, conv.id, message_text, _sm_adapter):
|
|
continue
|
|
if await _execute_team_collab(websocket, conv.id, message_text, _sm_adapter):
|
|
continue
|
|
|
|
# Generate task_id for this user message and emit task.created to EQ
|
|
# (EQ is a side-channel: emit failures never break the WebSocket flow)
|
|
task_id = str(uuid.uuid4())
|
|
event_queue: EventQueue | None = getattr(websocket.app.state, "event_queue", None)
|
|
task_store: InMemoryTaskStore | None = getattr(websocket.app.state, "task_store", None)
|
|
await _emit_event_safe(
|
|
event_queue,
|
|
TaskEventType.TASK_CREATED,
|
|
task_id=task_id,
|
|
session_id=conv.id,
|
|
data={"message": message_text},
|
|
)
|
|
|
|
# Add user message to conversation
|
|
await _conversation_store.add_message(conv.id, "user", message_text)
|
|
start_time = datetime.now(timezone.utc)
|
|
|
|
async def _record_experience(
|
|
task_type: str, goal: str, outcome: str, duration_seconds: float
|
|
) -> None:
|
|
"""Record experience to dashboard after chat completion."""
|
|
try:
|
|
exp = DashboardExperience(
|
|
id=str(uuid.uuid4()),
|
|
task_type=task_type,
|
|
goal=goal[:200],
|
|
outcome=outcome,
|
|
duration_seconds=duration_seconds,
|
|
created_at=datetime.now(timezone.utc),
|
|
)
|
|
_dashboard_experiences.append(exp)
|
|
await _broadcast_dashboard_event(
|
|
"experience_added",
|
|
{
|
|
"id": exp.id,
|
|
"task_type": exp.task_type,
|
|
"goal": exp.goal,
|
|
"outcome": exp.outcome,
|
|
},
|
|
)
|
|
await _broadcast_dashboard_event("metrics_updated", {"period": "7d"})
|
|
except (
|
|
asyncio.QueueFull,
|
|
RuntimeError,
|
|
ConnectionError,
|
|
ValueError,
|
|
KeyError,
|
|
) as e:
|
|
logger.warning(f"Failed to record experience: {e}")
|
|
|
|
# Unified preprocessing via RequestPreprocessor (minimal: @skill prefix + greeting regex + REACT)
|
|
pool = websocket.app.state.agent_pool
|
|
skill_registry = websocket.app.state.skill_registry
|
|
llm_gateway = websocket.app.state.llm_gateway
|
|
request_preprocessor: RequestPreprocessor = websocket.app.state.request_preprocessor
|
|
|
|
all_skills = skill_registry.list_skills()
|
|
|
|
# Get default tools for RequestPreprocessor
|
|
default_tools = []
|
|
default_system_prompt = None
|
|
default_agent = pool.get_agent("default")
|
|
if default_agent is not None:
|
|
default_tools = default_agent.get_tools()
|
|
default_system_prompt = (
|
|
getattr(default_agent, "_system_prompt", None)
|
|
or default_agent.get_system_prompt()
|
|
)
|
|
else:
|
|
for skill in all_skills:
|
|
agent = pool.get_agent(skill.name)
|
|
if agent is not None:
|
|
default_tools = agent.get_tools()
|
|
default_system_prompt = (
|
|
getattr(agent, "_system_prompt", None) or agent.get_system_prompt()
|
|
)
|
|
break
|
|
|
|
# Preprocess via RequestPreprocessor (minimal: @skill prefix + greeting regex + REACT)
|
|
routing_result = await request_preprocessor.preprocess(
|
|
content=message_text,
|
|
skill_registry=skill_registry,
|
|
default_tools=default_tools,
|
|
default_system_prompt=default_system_prompt,
|
|
default_model=model_override or "default",
|
|
default_agent_name="default",
|
|
)
|
|
|
|
await websocket.send_json(
|
|
{
|
|
"type": "routing",
|
|
"skill": routing_result.agent_name or "default",
|
|
"method": routing_result.match_method or "intent",
|
|
"confidence": routing_result.match_confidence,
|
|
}
|
|
)
|
|
|
|
# Emit task.started to EQ (execution begins after routing)
|
|
await _emit_event_safe(
|
|
event_queue,
|
|
TaskEventType.TASK_STARTED,
|
|
task_id=task_id,
|
|
session_id=conv.id,
|
|
data={
|
|
"agent_name": routing_result.agent_name or "default",
|
|
"execution_mode": routing_result.execution_mode.value
|
|
if hasattr(routing_result.execution_mode, "value")
|
|
else str(routing_result.execution_mode),
|
|
},
|
|
)
|
|
|
|
# Register task in TaskStore for status tracking and recovery
|
|
if task_store is not None:
|
|
try:
|
|
await _task_store_create(
|
|
task_store,
|
|
task_id=task_id,
|
|
agent_name=routing_result.agent_name or "default",
|
|
input_data={"message": message_text},
|
|
skill_name=routing_result.skill_name,
|
|
)
|
|
# Store conversation_id in metadata for frontend recovery
|
|
await _task_store_update_status(
|
|
task_store,
|
|
task_id,
|
|
TaskStatus.PENDING,
|
|
metadata={"conversation_id": conv.id},
|
|
)
|
|
except (
|
|
ConnectionError,
|
|
OSError,
|
|
asyncio.TimeoutError,
|
|
ValueError,
|
|
KeyError,
|
|
RuntimeError,
|
|
):
|
|
logger.warning("Failed to register task in TaskStore", exc_info=True)
|
|
|
|
# Execute based on routing result's execution_mode
|
|
# This is the single source of truth for path selection,
|
|
# replacing fragile string-matching on match_method.
|
|
if routing_result.execution_mode == ExecutionMode.DIRECT_CHAT:
|
|
# Zero-cost path: direct LLM call, no ReAct loop
|
|
chat_messages = []
|
|
# Inject system prompt (contains SOUL/USER/MEMORY/DAILY) for identity continuity
|
|
if routing_result.system_prompt:
|
|
chat_messages.append(
|
|
{"role": "system", "content": routing_result.system_prompt}
|
|
)
|
|
chat_messages.append({"role": "user", "content": message_text})
|
|
# Inject conversation history for context continuity
|
|
history_msgs = await _build_history_messages(conv.id)
|
|
for hm in history_msgs:
|
|
chat_messages.insert(-1, hm)
|
|
response = await llm_gateway.chat(
|
|
messages=chat_messages,
|
|
model=model_override or "default",
|
|
agent_name="default",
|
|
task_type="chat",
|
|
)
|
|
# Store assistant reply for multi-turn context continuity
|
|
response_content = _ensure_non_empty(response.content)
|
|
await _conversation_store.add_message(conv.id, "assistant", response_content)
|
|
|
|
# Update TaskStore status to COMPLETED
|
|
if task_store is not None:
|
|
try:
|
|
await _task_store_update_status(
|
|
task_store,
|
|
task_id,
|
|
TaskStatus.COMPLETED,
|
|
output_data={"output": response_content},
|
|
completed_at=datetime.now(timezone.utc),
|
|
progress=1.0,
|
|
progress_message="Completed",
|
|
)
|
|
except (
|
|
ConnectionError,
|
|
OSError,
|
|
asyncio.TimeoutError,
|
|
ValueError,
|
|
KeyError,
|
|
RuntimeError,
|
|
):
|
|
logger.warning("Failed to update TaskStore for DIRECT_CHAT", exc_info=True)
|
|
|
|
# Emit turn.final_answer and task.completed to EQ
|
|
await _emit_event_safe(
|
|
event_queue,
|
|
TurnEventType.FINAL_ANSWER,
|
|
task_id=task_id,
|
|
session_id=conv.id,
|
|
data={"output": response_content},
|
|
)
|
|
await _emit_event_safe(
|
|
event_queue,
|
|
TaskEventType.TASK_COMPLETED,
|
|
task_id=task_id,
|
|
session_id=conv.id,
|
|
data={"output": response_content},
|
|
)
|
|
|
|
await websocket.send_json(
|
|
{
|
|
"type": "result",
|
|
"data": {
|
|
"message": response_content,
|
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
},
|
|
}
|
|
)
|
|
await _record_experience(
|
|
"chat",
|
|
message_text,
|
|
"success",
|
|
(datetime.now(timezone.utc) - start_time).total_seconds(),
|
|
)
|
|
continue
|
|
|
|
# REACT / SKILL_REACT / REWOO / REFLEXION / PLAN_EXEC / TEAM_COLLAB
|
|
# Advanced modes fall back to REACT with a warning.
|
|
if routing_result.execution_mode not in (
|
|
ExecutionMode.REACT,
|
|
ExecutionMode.SKILL_REACT,
|
|
):
|
|
logger.warning(
|
|
f"Execution mode {routing_result.execution_mode.value} not yet supported "
|
|
f"in portal WebSocket, falling back to REACT"
|
|
)
|
|
|
|
agent_name = routing_result.agent_name or "default"
|
|
agent = pool.get_agent(agent_name)
|
|
if agent is None:
|
|
# Agent not in pool — fall back to direct chat.
|
|
# This handles the case where routing returned an agent_name
|
|
# that doesn't exist in the pool (e.g. "default" or a
|
|
# skill that hasn't been instantiated yet).
|
|
logger.info(
|
|
f"Session {conv.id}: agent '{agent_name}' not in pool, falling back to direct chat"
|
|
)
|
|
chat_messages = []
|
|
# Inject system prompt (contains SOUL/USER/MEMORY/DAILY) for identity continuity
|
|
if routing_result.system_prompt:
|
|
chat_messages.append(
|
|
{"role": "system", "content": routing_result.system_prompt}
|
|
)
|
|
chat_messages.append({"role": "user", "content": message_text})
|
|
try:
|
|
history = await _conversation_store.get_history(conv.id, limit=20)
|
|
for hist_msg in history[:-1]:
|
|
if hist_msg.role in ("user", "assistant"):
|
|
chat_messages.insert(
|
|
-1, {"role": hist_msg.role, "content": hist_msg.content}
|
|
)
|
|
except (
|
|
ConnectionError,
|
|
OSError,
|
|
asyncio.TimeoutError,
|
|
ValueError,
|
|
KeyError,
|
|
RuntimeError,
|
|
):
|
|
pass
|
|
response = await llm_gateway.chat(
|
|
messages=chat_messages,
|
|
model=model_override or "default",
|
|
agent_name="default",
|
|
task_type="chat",
|
|
)
|
|
# Store assistant reply for multi-turn context continuity
|
|
response_content = _ensure_non_empty(response.content)
|
|
await _conversation_store.add_message(conv.id, "assistant", response_content)
|
|
|
|
# Emit turn.final_answer and task.completed to EQ (fallback path)
|
|
await _emit_event_safe(
|
|
event_queue,
|
|
TurnEventType.FINAL_ANSWER,
|
|
task_id=task_id,
|
|
session_id=conv.id,
|
|
data={"output": response_content},
|
|
)
|
|
await _emit_event_safe(
|
|
event_queue,
|
|
TaskEventType.TASK_COMPLETED,
|
|
task_id=task_id,
|
|
session_id=conv.id,
|
|
data={"output": response_content},
|
|
)
|
|
|
|
await websocket.send_json(
|
|
{
|
|
"type": "result",
|
|
"data": {
|
|
"status": "completed",
|
|
"content": response_content,
|
|
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
},
|
|
}
|
|
)
|
|
await _record_experience(
|
|
"chat",
|
|
message_text,
|
|
"success",
|
|
(datetime.now(timezone.utc) - start_time).total_seconds(),
|
|
)
|
|
continue
|
|
|
|
# Execute via ReAct stream
|
|
react_config = agent.get_react_config()
|
|
# Reuse agent's ReActEngine if available (aligned with chat.py pattern)
|
|
react_engine = getattr(agent, "_react_engine", None)
|
|
if react_engine is None:
|
|
react_engine = ReActEngine(
|
|
llm_gateway=llm_gateway,
|
|
max_steps=react_config["max_steps"],
|
|
)
|
|
else:
|
|
react_engine.reset()
|
|
|
|
messages = [{"role": "user", "content": message_text}]
|
|
# Inject conversation history for context continuity
|
|
history_msgs = await _build_history_messages(conv.id)
|
|
for hm in reversed(history_msgs):
|
|
messages.insert(0, hm)
|
|
tools = agent.get_tools()
|
|
model = model_override or agent.get_model()
|
|
system_prompt = getattr(agent, "_system_prompt", None) or agent.get_system_prompt()
|
|
timeout_seconds = react_config["timeout_seconds"]
|
|
logger.info(
|
|
f"[portal] agent='{agent_name}' tools={len(tools)} "
|
|
f"[{', '.join(t.name for t in tools)}] model={model}"
|
|
)
|
|
|
|
# Start ReAct execution as a background task, decoupled from
|
|
# WebSocket lifecycle. When the WebSocket disconnects, the
|
|
# background task continues running and persists the result.
|
|
bg_task = asyncio.create_task(
|
|
_execute_react_background(
|
|
react_engine=react_engine,
|
|
messages=messages,
|
|
tools=tools,
|
|
model=model,
|
|
agent_name=agent.name,
|
|
system_prompt=system_prompt,
|
|
timeout_seconds=timeout_seconds,
|
|
conv_id=conv.id,
|
|
task_id=task_id,
|
|
event_queue=event_queue,
|
|
conversation_store=_conversation_store,
|
|
task_store=task_store,
|
|
)
|
|
)
|
|
_running_background_tasks.add(bg_task)
|
|
bg_task.add_done_callback(_running_background_tasks.discard)
|
|
active_bg_task = bg_task
|
|
|
|
# C1 guard: EventQueue is required for subscribe; fall back to
|
|
# awaiting the background task directly if unavailable.
|
|
if event_queue is None:
|
|
logger.warning("EventQueue not configured; awaiting background task directly")
|
|
try:
|
|
await bg_task
|
|
except (RuntimeError, ConnectionError, asyncio.TimeoutError):
|
|
pass # errors handled inside _execute_react_background
|
|
active_bg_task = None
|
|
continue
|
|
|
|
# Subscribe to EventQueue (filtered by task_id) and forward
|
|
# events to the WebSocket. When the WebSocket disconnects,
|
|
# this loop exits but the background task continues.
|
|
# P1 #7 fix: bound the subscribe loop with a timeout so a
|
|
# hung background task cannot block the WebSocket forever.
|
|
# Matches the resume path's timeout strategy.
|
|
_subscribe_timeout = _WS_HEARTBEAT_TIMEOUT * 10 if _WS_HEARTBEAT_TIMEOUT > 0 else 600
|
|
try:
|
|
async with asyncio.timeout(_subscribe_timeout):
|
|
async for event in event_queue.subscribe(task_id=task_id):
|
|
if event.event_type == TaskEventType.TASK_COMPLETED:
|
|
response_text = event.data.get("output", EMPTY_LLM_RESPONSE)
|
|
await websocket.send_json(
|
|
{
|
|
"type": "result",
|
|
"data": {
|
|
"message": response_text,
|
|
"timestamp": event.data.get(
|
|
"timestamp",
|
|
datetime.now(timezone.utc).isoformat(),
|
|
),
|
|
},
|
|
}
|
|
)
|
|
await _record_experience(
|
|
routing_result.skill_name or "agent",
|
|
message_text,
|
|
"success" if response_text != EMPTY_LLM_RESPONSE else "failure",
|
|
(datetime.now(timezone.utc) - start_time).total_seconds(),
|
|
)
|
|
break
|
|
elif event.event_type == TaskEventType.TASK_FAILED:
|
|
await websocket.send_json(
|
|
{
|
|
"type": "error",
|
|
"data": {
|
|
"message": event.data.get("error", "Unknown error"),
|
|
},
|
|
}
|
|
)
|
|
await _record_experience(
|
|
routing_result.skill_name or "agent",
|
|
message_text,
|
|
"failure",
|
|
(datetime.now(timezone.utc) - start_time).total_seconds(),
|
|
)
|
|
break
|
|
else:
|
|
# Forward ReAct events as step messages.
|
|
# P1 #8/#10 fix: step and data are now top-level
|
|
# fields in event.data (no longer nested).
|
|
await websocket.send_json(
|
|
{
|
|
"type": "step",
|
|
"data": {
|
|
"event_type": event.event_type,
|
|
"step": event.data.get("step", 0),
|
|
"data": {
|
|
k: v
|
|
for k, v in event.data.items()
|
|
if k not in ("step", "timestamp")
|
|
},
|
|
"timestamp": event.data.get("timestamp", ""),
|
|
},
|
|
}
|
|
)
|
|
except TimeoutError:
|
|
logger.warning(f"Subscribe loop timed out for task {task_id}")
|
|
if active_bg_task is not None and not active_bg_task.done():
|
|
active_bg_task.cancel()
|
|
await websocket.send_json(
|
|
{
|
|
"type": "error",
|
|
"data": {
|
|
"message": "Task timed out. Please retry your request.",
|
|
"task_id": task_id,
|
|
},
|
|
}
|
|
)
|
|
except RuntimeError as exc:
|
|
# P1 #5: subscriber limit reached or EQ closed — send
|
|
# a friendly error instead of terminating the connection.
|
|
logger.warning("Subscribe failed for task %s: %s", task_id, exc)
|
|
await websocket.send_json(
|
|
{
|
|
"type": "error",
|
|
"data": {
|
|
"message": "Server busy, please retry shortly.",
|
|
"task_id": task_id,
|
|
},
|
|
}
|
|
)
|
|
|
|
except WebSocketDisconnect:
|
|
logger.debug(f"Portal WebSocket disconnected for conversation {conv.id if conv else 'N/A'}")
|
|
# P0 fix: Do NOT cancel the background task on disconnect.
|
|
# The entire purpose of the three-layer defense is to let the
|
|
# background task continue running and persist the result so the
|
|
# frontend can resume it after reconnection. Cancelling here would
|
|
# kill the task, lose the full output, and mark it FAILED —
|
|
# defeating layers 2 and 3. The task is only cancelled on explicit
|
|
# user cancel (msg_type == 'cancel') or application shutdown.
|
|
except asyncio.CancelledError:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Portal WebSocket error: {e}")
|
|
# P1 #6 fix: Do NOT cancel the background task on connection-level
|
|
# errors (ConnectionResetError, BrokenPipeError, etc.). These are
|
|
# functionally equivalent to WebSocketDisconnect — the client dropped
|
|
# — and the background task must survive to persist its result.
|
|
# Only cancel on truly unexpected errors that may have corrupted
|
|
# state needed by the background task.
|
|
if not isinstance(e, (ConnectionResetError, BrokenPipeError, ConnectionError)):
|
|
if active_bg_task is not None and not active_bg_task.done():
|
|
active_bg_task.cancel()
|
|
# Emit task.failed to EQ if a task was in progress
|
|
# (task_id is set when a user message is received; None before that)
|
|
if task_id is not None and conv is not None:
|
|
event_queue = getattr(websocket.app.state, "event_queue", None)
|
|
await _emit_event_safe(
|
|
event_queue,
|
|
TaskEventType.TASK_FAILED,
|
|
task_id=task_id,
|
|
session_id=conv.id,
|
|
data={"error": str(e)},
|
|
)
|
|
try:
|
|
await websocket.send_json({"type": "error", "data": {"message": str(e)}})
|
|
except (ConnectionError, RuntimeError, asyncio.TimeoutError):
|
|
pass
|
|
finally:
|
|
# Remove from user-scoped push tracking on any disconnect/error/return.
|
|
if ws_user_id:
|
|
portal_connection_manager.remove(ws_user_id, websocket)
|