fischer-agentkit/src/agentkit/server/routes/portal.py

import asyncio
import hmac
import json
import re
import logging
import os
import uuid
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path

from fastapi import (
    APIRouter,
    Depends,
    HTTPException,
    Request,
    WebSocket,
    WebSocketDisconnect,
    Security,
)
from fastapi.security import APIKeyHeader, APIKeyQuery
from pydantic import BaseModel

from agentkit.core.config_driven import ConfigDrivenAgent
from agentkit.core.event_queue import EventQueue
from agentkit.core.protocol import Event, TaskEventType, TaskStatus, TurnEventType
from agentkit.core.react import ReActEngine
from agentkit.chat.skill_routing import ExecutionMode, SkillRoutingResult
from agentkit.chat.request_preprocessor import RequestPreprocessor
from agentkit.server.routes.evolution_dashboard import (
    _experiences as _dashboard_experiences,
    DashboardExperience,
    _broadcast_event as _broadcast_dashboard_event,
)
from agentkit.core.fallback import EMPTY_LLM_RESPONSE
from agentkit.chat.sqlite_conversation_store import SqliteConversationStore
from agentkit.server.task_store import InMemoryTaskStore
from agentkit.session.models import MessageRole

# ponytail: importing module-private helpers from chat.py because the frontend
# WS connects to /api/v1/portal/ws (this router), not /api/v1/chat/ws/{session_id}.
# Without this, @board/@team prefixes are never intercepted and board/team cards
# never render. Upgrade path: extract these into a shared experts dispatch module.
from agentkit.server.routes.chat import _execute_board_meeting, _execute_team_collab

logger = logging.getLogger(__name__)

router = APIRouter(tags=["portal"])

# Use a project-local SQLite file to avoid read-only sandbox restrictions on ~/.agentkit.
# parents[4] walks up from .../src/agentkit/server/routes/portal.py to the
# directory that contains src/ (presumably the repository root — confirm if
# the package layout ever changes).
_PROJECT_ROOT = Path(__file__).parents[4]
# AGENTKIT_CONVERSATIONS_DB, when set, overrides the default
# <project root>/data/conversations.db location.
_CONVERSATIONS_DB_PATH = Path(
    os.environ.get("AGENTKIT_CONVERSATIONS_DB", _PROJECT_ROOT / "data" / "conversations.db")
)
# Import-time side effect: make sure the directory holding the database exists.
_CONVERSATIONS_DB_PATH.parent.mkdir(parents=True, exist_ok=True)

# Track background ReAct tasks so they are not garbage-collected mid-execution.
# Tasks are removed automatically via add_done_callback when they complete.
_running_background_tasks: set[asyncio.Task] = set()

# ---------------------------------------------------------------------------
# API Key Authentication
# ---------------------------------------------------------------------------

_api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
_api_key_query = APIKeyQuery(name="api_key", auto_error=False)


def _ensure_non_empty(text: str | None) -> str:
    """Ensure response text is never empty or whitespace-only."""
    if text and text.strip():
        return text
    return EMPTY_LLM_RESPONSE


async def _emit_event_safe(
    event_queue: EventQueue | None,
    event_type: str,
    task_id: str,
    session_id: str,
    data: dict | None = None,
) -> None:
    """Best-effort publish of a single event on the EventQueue side-channel.

    The queue is auxiliary to the WebSocket flow, so the expected emit
    failures (``asyncio.QueueFull``, ``RuntimeError``, ``ConnectionError``)
    are logged at warning level and swallowed. Any other exception type is
    not caught here and propagates to the caller.

    Args:
        event_queue: Queue to publish on; ``None`` turns the call into a no-op.
        event_type: Event type (see TaskEventType / TurnEventType).
        task_id: Task the event belongs to.
        session_id: Session (conversation id) the event belongs to.
        data: Optional payload; a missing or empty value is sent as ``{}``.
    """
    if event_queue is not None:
        try:
            payload = data or {}
            await event_queue.emit(
                Event.create(
                    event_type=event_type,
                    task_id=task_id,
                    session_id=session_id,
                    data=payload,
                )
            )
        except (asyncio.QueueFull, RuntimeError, ConnectionError) as e:
            logger.warning(f"EventQueue emit failed (type={event_type}): {e}", exc_info=True)


# P1 #14 fix: TaskStore sync/async compatibility shim.
# InMemoryTaskStore methods are sync; RedisTaskStore methods are async.
# These helpers detect and await coroutines so portal.py works with both.
async def _resolve_maybe_awaitable(result):
    """Return *result*, awaiting it first when it is awaitable.

    ``inspect.isawaitable`` covers coroutines as well as Futures, Tasks and
    objects defining ``__await__``, so an async store that hands back any of
    those is resolved instead of being returned un-awaited.
    """
    import inspect  # local import keeps the shim self-contained

    if inspect.isawaitable(result):
        return await result
    return result


async def _task_store_create(store, *args, **kwargs):
    """Call ``store.create(*args, **kwargs)`` on a sync or async task store."""
    return await _resolve_maybe_awaitable(store.create(*args, **kwargs))


async def _task_store_get(store, *args, **kwargs):
    """Call ``store.get(*args, **kwargs)`` on a sync or async task store."""
    return await _resolve_maybe_awaitable(store.get(*args, **kwargs))


async def _task_store_update_status(store, *args, **kwargs):
    """Call ``store.update_status(*args, **kwargs)`` on a sync or async task store."""
    return await _resolve_maybe_awaitable(store.update_status(*args, **kwargs))


async def _task_store_list_tasks(store, *args, **kwargs):
    """Call ``store.list_tasks(*args, **kwargs)`` on a sync or async task store."""
    return await _resolve_maybe_awaitable(store.list_tasks(*args, **kwargs))


async def _verify_api_key(
    request: Request,
    api_key_header: str | None = Security(_api_key_header),
    api_key_query: str | None = Security(_api_key_query),
) -> None:
    """FastAPI dependency that enforces the configured API key on REST routes.

    The expected key is read from ``app.state.server_config.api_key`` and,
    when that yields nothing, from ``app.state.api_key``. With no key
    configured every request is accepted (backwards compatibility).

    Raises:
        HTTPException: 401 when a key is configured and the supplied
            header/query value does not match it.
    """
    state = request.app.state

    server_config = getattr(state, "server_config", None)
    expected: str | None = server_config.api_key if server_config else None
    if expected is None:
        expected = getattr(state, "api_key", None)

    # No key configured: open access.
    if expected is None:
        return

    # The header wins over the query parameter; a missing key compares as "".
    candidate = api_key_header or api_key_query or ""
    # Constant-time comparison via hmac.compare_digest.
    if hmac.compare_digest(candidate.encode(), expected.encode()):
        return
    raise HTTPException(
        status_code=401,
        detail="Invalid or missing API key. Provide via X-API-Key header or api_key query parameter.",
    )


# ---------------------------------------------------------------------------
# Conversation models and SQLite-backed conversation store
# ---------------------------------------------------------------------------


@dataclass
class ChatMessage:
    """A single chat turn: speaker role, text, creation time and extra metadata."""

    role: str  # "user" or "assistant"
    content: str
    # Timezone-aware UTC timestamp captured when the instance is created.
    timestamp: datetime = field(default_factory=lambda: datetime.now(tz=timezone.utc))
    # Each message gets its own dict; nothing is shared between instances.
    metadata: dict = field(default_factory=dict)


@dataclass
class Conversation:
    """A conversation: its id, its messages and its creation/update times."""

    id: str
    # Fresh list per conversation.
    messages: list[ChatMessage] = field(default_factory=list)
    # Both timestamps default to "now" in UTC at construction time.
    created_at: datetime = field(default_factory=lambda: datetime.now(tz=timezone.utc))
    updated_at: datetime = field(default_factory=lambda: datetime.now(tz=timezone.utc))


# Heartbeat timeout in seconds — 0 disables timeout (for testing)
_WS_HEARTBEAT_TIMEOUT = float(os.environ.get("AGENTKIT_WS_TIMEOUT", "120"))
_conversation_store = SqliteConversationStore(db_path=_CONVERSATIONS_DB_PATH)


class _ConvStoreAsSessionManager:
    """Expose a SqliteConversationStore through ``SessionManager.append_message``.

    chat.py's _execute_board_meeting / _execute_team_collab expect an object
    with that method; this adapter forwards the call to the store's
    ``add_message``.

    ponytail: append_message is the only method provided because it is the
    only one the board/team intercepts call. Add further SessionManager
    methods here if later logic needs them.
    """

    def __init__(self, store: SqliteConversationStore) -> None:
        self._store = store

    async def append_message(
        self,
        session_id: str,
        role: MessageRole,
        content: str,
        tool_call_id: str | None = None,
        agent_name: str | None = None,
        metadata: dict[str, object] | None = None,
    ) -> None:
        """Persist one message; ``tool_call_id`` and ``agent_name`` are not forwarded."""
        # Enum-like roles carry the wire value in ``.value``; anything else is
        # stringified as-is.
        try:
            stored_role = role.value
        except AttributeError:
            stored_role = str(role)
        await self._store.add_message(session_id, stored_role, content, metadata)


_sm_adapter = _ConvStoreAsSessionManager(_conversation_store)


# ---------------------------------------------------------------------------
# Active portal WebSocket connections by user_id
# ---------------------------------------------------------------------------


class PortalConnectionManager:
    """Track active portal WebSocket connections by authenticated user_id.

    Used by the calendar reminder scheduler (and other user-scoped push
    features) to deliver real-time messages to a user's open chat tab(s).
    """

    # ponytail: per-user connection cap prevents a single client from
    # exhausting memory via unbounded WS spawns. 16 covers typical
    # multi-tab usage. Upgrade path: make configurable via server_config.
    _MAX_CONNECTIONS_PER_USER = 16

    def __init__(self) -> None:
        # user_id -> list of active WebSocket connections, oldest first
        self._connections: dict[str, list[WebSocket]] = {}
        # Strong references to in-flight eviction close() tasks so the event
        # loop cannot garbage-collect them before they finish.
        self._eviction_tasks: set[asyncio.Task] = set()

    def add(self, user_id: str, ws: WebSocket) -> None:
        """Register *ws* for *user_id*, evicting the oldest socket when at the cap."""
        conns = self._connections.setdefault(user_id, [])
        if len(conns) >= self._MAX_CONNECTIONS_PER_USER:
            # FIFO eviction: drop the oldest connection to make room.
            self._schedule_close(conns.pop(0))
        conns.append(ws)

    def _schedule_close(self, ws: WebSocket) -> None:
        """Close an evicted socket in the background, best-effort."""
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No running loop: nothing can drive the close. Bail out before
            # creating the coroutine so no "never awaited" warning is raised.
            return
        task = loop.create_task(self._close_quietly(ws))
        self._eviction_tasks.add(task)
        task.add_done_callback(self._eviction_tasks.discard)

    @staticmethod
    async def _close_quietly(ws: WebSocket) -> None:
        """Close *ws*, ignoring failures from sockets that are already dead."""
        try:
            await ws.close(code=1008, reason="Connection limit exceeded")
        except (ConnectionError, RuntimeError, WebSocketDisconnect) as e:
            logger.debug("Portal WS eviction close failed (ignored): %s", e)

    def remove(self, user_id: str, ws: WebSocket) -> None:
        """Forget *ws* for *user_id*; unknown users and sockets are ignored."""
        conns = self._connections.get(user_id)
        if conns is None:
            return
        remaining = [w for w in conns if w is not ws]
        if remaining:
            self._connections[user_id] = remaining
        else:
            # Drop the key entirely so idle users do not accumulate.
            del self._connections[user_id]

    async def send_json(self, user_id: str, message: dict[str, object]) -> None:
        """Broadcast a JSON message to all connections for *user_id*.

        Connections that fail to send are treated as stale and removed; one
        failing socket does not stop delivery to the others.
        """
        # Snapshot the list: remove() below rebinds the stored list.
        conns = list(self._connections.get(user_id, []))
        if not conns:
            return
        stale: list[WebSocket] = []
        for ws in conns:
            try:
                await ws.send_json(message)
            except (
                ConnectionError,
                RuntimeError,
                asyncio.TimeoutError,
                WebSocketDisconnect,
            ) as e:
                # WebSocketDisconnect is what Starlette raises when the peer
                # is gone at send time; without it a dead tab would abort
                # the whole broadcast and never be pruned.
                logger.debug("Portal WS send failed for user %s (marking stale): %s", user_id, e)
                stale.append(ws)
        for ws in stale:
            self.remove(user_id, ws)


portal_connection_manager = PortalConnectionManager()


async def send_to_user(user_id: str, message: dict[str, object]) -> None:
    """Deliver *message* to every open portal WebSocket belonging to *user_id*.

    Module-level convenience wrapper around the shared
    ``portal_connection_manager`` instance.
    """
    manager = portal_connection_manager
    await manager.send_json(user_id, message)


# P1 #9 fix: ReAct event type -> TurnEventType mapping for EQ subscribers.
# Preserves the original EQ contract so CLI and other subscribers that
# filter on TurnEventType constants (e.g. 'turn.thinking') keep working.
# Keys are the raw ReAct event_type strings; values are the TurnEventType
# constants published to EventQueue subscribers in their place.
_REACT_EVENT_TYPE_MAP: dict[str, str] = {
    "thinking": TurnEventType.THINKING,
    "tool_call": TurnEventType.TOOL_CALL,
    "tool_result": TurnEventType.TOOL_RESULT,
    "token": TurnEventType.TOKEN,
    "final_answer": TurnEventType.FINAL_ANSWER,
    # No dedicated error constant is used here, so errors are reported as a
    # completed turn.
    "error": TurnEventType.TURN_COMPLETED,  # best-effort mapping
    # Confirmation prompts are surfaced as a generic step event.
    "confirmation_request": TurnEventType.STEP,
}

# ---------------------------------------------------------------------------
# History injection helper — configurable limit + optional compression
# ---------------------------------------------------------------------------

# Maximum history messages to inject (can be overridden by server config)
_MAX_HISTORY_MESSAGES = 50


async def _build_history_messages(
    conv_id: str,
    limit: int = _MAX_HISTORY_MESSAGES,
) -> list[dict]:
    """Build conversation history messages for LLM context injection.

    Args:
        conv_id: Conversation whose stored history is read.
        limit: Maximum number of stored messages to fetch.

    Returns:
        A list of ``{"role": "user"|"assistant", "content": ...}`` dicts in
        stored order. The newest stored message is left out because callers
        persist the current user message before calling this and append it
        to the prompt themselves. Messages with any other role are dropped.
        An empty list is returned when the store lookup fails.
    """
    try:
        history = await _conversation_store.get_history(conv_id, limit=limit)
    except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError):
        # Degrade to a history-less prompt, but leave a trace: a silent
        # failure here looks like the assistant "forgetting" the conversation.
        logger.warning(
            "History lookup failed for %s; continuing without history",
            conv_id,
            exc_info=True,
        )
        return []

    # history[:-1] skips the just-added current user message (avoids duplication).
    return [
        {"role": hist_msg.role, "content": hist_msg.content}
        for hist_msg in history[:-1]
        if hist_msg.role in ("user", "assistant")
    ]


# ---------------------------------------------------------------------------
# Capability mapping
# ---------------------------------------------------------------------------

# Static catalogue of portal capability categories, keyed by category name.
# display_name / description are user-facing strings (Chinese); the English
# glosses in the comments are for maintainers only. Icon values look like
# Ant Design icon component names — confirm against the frontend.
CAPABILITY_CATEGORIES: dict[str, dict[str, str]] = {
    # Intelligent chat — natural-language interaction, auto-routed to the matching capability
    "chat": {
        "display_name": "智能对话",
        "description": "自然语言交互，自动路由到对应能力",
        "icon": "MessageOutlined",
    },
    # Workflow orchestration — visual drag-and-drop workflow building
    "workflow": {
        "display_name": "工作流编排",
        "description": "可视化拖拽编排工作流",
        "icon": "ApartmentOutlined",
    },
    # Knowledge base — document ingestion, semantic retrieval, multi-source RAG
    "knowledge": {
        "display_name": "知识库",
        "description": "文档摄取、语义检索、多源RAG",
        "icon": "BookOutlined",
    },
    # Skill management — browse and manage registered skills
    "skills": {
        "display_name": "技能管理",
        "description": "浏览和管理已注册的技能",
        "icon": "AppstoreOutlined",
    },
    # Smart terminal — interactive terminal sessions and command execution
    "terminal": {
        "display_name": "智能终端",
        "description": "交互式终端会话和命令执行",
        "icon": "CodeOutlined",
    },
    # Computer Use — UI automation and screenshot recognition
    "computer_use": {
        "display_name": "Computer Use",
        "description": "UI自动化操作和截屏识别",
        "icon": "DesktopOutlined",
    },
    # Self-evolution — experience accumulation, pitfall warnings, path optimization
    "evolution": {
        "display_name": "自进化",
        "description": "经验积累、避坑预警、路径优化",
        "icon": "RiseOutlined",
    },
    # System settings — configure LLM, skills and knowledge-base connections
    "settings": {
        "display_name": "系统设置",
        "description": "配置LLM、技能、知识库连接",
        "icon": "SettingOutlined",
    },
}


# ---------------------------------------------------------------------------
# Request / Response models
# ---------------------------------------------------------------------------


class ChatRequest(BaseModel):
    # Request body shared by POST /portal/chat and POST /portal/chat/stream.
    message: str  # the user's message text
    # Handed to the conversation store's get_or_create(); None presumably
    # starts a new conversation — confirm against the store.
    conversation_id: str | None = None
    sources: list[str] | None = None  # not read by the handlers in this part of the file
    # When set, routing is forced by prefixing the message with "@skill:<name>".
    skill_name: str | None = None


class ChatResponse(BaseModel):
    # Response body of POST /portal/chat.
    conversation_id: str  # id of the conversation the exchange was stored under
    message: str  # assistant reply text
    timestamp: str  # ISO-8601 UTC time at which the response was built
    matched_skill: str | None = None  # routing result's skill_name, else its agent_name
    routing_method: str | None = None  # routing result's match_method
    confidence: float | None = None  # routing result's match_confidence
    task_id: str | None = None  # fresh UUID generated per request
    status: str = "completed"


class CapabilityInfo(BaseModel):
    # One entry of the capabilities listing, built from CAPABILITY_CATEGORIES.
    name: str  # category key, e.g. "chat"
    display_name: str
    description: str
    icon: str
    enabled: bool  # the listing endpoint currently always sets this to True
    skill_count: int  # number of registered skills tagged with this category


class CapabilitiesResponse(BaseModel):
    # Response body of GET /portal/capabilities.
    capabilities: list[CapabilityInfo]  # one entry per CAPABILITY_CATEGORIES key


# ---------------------------------------------------------------------------
# Helper: resolve agent + skill for a chat request
# ---------------------------------------------------------------------------


async def _resolve_for_chat(
    request: ChatRequest, req: Request
) -> tuple[
    ConfigDrivenAgent | None, SkillRoutingResult | None, str | None, str | None, float | None
]:
    """Route a chat request and pick the agent that should serve it.

    The message is run through the app's RequestPreprocessor (with a forced
    ``@skill:<name>`` prefix when the request names a skill), then an agent
    is looked up in — or created by — the agent pool.

    Returns:
        ``(agent, routing_result, matched_skill_name, routing_method,
        confidence)``. ``agent`` is ``None`` when neither a routed skill
        agent, a "default" agent nor any registered skill is available.
    """
    app_state = req.app.state
    pool = app_state.agent_pool
    skill_registry = app_state.skill_registry
    request_preprocessor: RequestPreprocessor = app_state.request_preprocessor

    def _prompt_of(candidate):
        # A privately cached prompt wins over the agent's public getter.
        return getattr(candidate, "_system_prompt", None) or candidate.get_system_prompt()

    # Baseline tools/prompt come from the "default" agent or, failing that,
    # from the first registered skill that already has a pooled agent.
    baseline = pool.get_agent("default")
    if baseline is None:
        for skill in skill_registry.list_skills():
            baseline = pool.get_agent(skill.name)
            if baseline is not None:
                break

    default_tools = []
    default_system_prompt = None
    if baseline is not None:
        default_tools = baseline.get_tools()
        default_system_prompt = _prompt_of(baseline)

    # An explicit skill_name is expressed as an "@skill:" prefix so the
    # preprocessor handles both cases through the same call.
    if request.skill_name:
        content = f"@skill:{request.skill_name} {request.message}"
    else:
        content = request.message

    routing_result = await request_preprocessor.preprocess(
        content=content,
        skill_registry=skill_registry,
        default_tools=default_tools,
        default_system_prompt=default_system_prompt,
        default_model="default",
        default_agent_name="default",
    )

    matched_skill_name = routing_result.skill_name or routing_result.agent_name
    routing_method = routing_result.match_method
    confidence = routing_result.match_confidence

    routed_skill = routing_result.skill_name if routing_result.matched else None
    if routed_skill:
        # Reuse the pooled agent for the matched skill, creating it on demand.
        agent = pool.get_agent(routed_skill)
        if agent is None:
            agent = await pool.create_agent_from_skill(routed_skill)
    else:
        agent = pool.get_agent("default")
        if agent is None:
            # Fallback: build an agent from the first available skill, if any.
            available = skill_registry.list_skills()
            if available:
                agent = await pool.create_agent_from_skill(available[0].name)

    return agent, routing_result, matched_skill_name, routing_method, confidence


# ---------------------------------------------------------------------------
# Endpoints
# ---------------------------------------------------------------------------


@router.post("/portal/chat", response_model=ChatResponse)
async def chat(request: ChatRequest, req: Request, _auth: None = Depends(_verify_api_key)):
    """Send a chat message and get a response with RequestPreprocessor routing.

    DIRECT_CHAT requests are answered by a single LLM call; every other
    execution mode runs the agent's ReAct loop (advanced modes fall back to
    REACT with a warning). Conversation history is injected in both cases,
    and the user message and the reply are persisted to the conversation
    store.

    Raises:
        HTTPException: 404 when ``skill_name`` is given but not registered;
            503 when a ReAct-style mode is selected but no agent could be
            resolved.
    """
    # If skill_name is explicitly requested but not found, return 404
    if request.skill_name:
        skill_registry = req.app.state.skill_registry
        if not skill_registry.has_skill(request.skill_name):
            raise HTTPException(status_code=404, detail=f"Skill '{request.skill_name}' not found")

    agent, routing_result, matched_skill, routing_method, confidence = await _resolve_for_chat(
        request, req
    )

    is_direct_chat = (
        routing_result is not None and routing_result.execution_mode == ExecutionMode.DIRECT_CHAT
    )
    # _resolve_for_chat may return no agent. Fail cleanly, and before
    # persisting anything, instead of crashing on agent.get_react_config().
    if not is_direct_chat and agent is None:
        raise HTTPException(
            status_code=503, detail="No agent is available to handle this request"
        )

    # Create or reuse conversation
    conv = await _conversation_store.get_or_create(request.conversation_id)
    await _conversation_store.add_message(conv.id, "user", request.message)

    llm_gateway = req.app.state.llm_gateway

    task_id = str(uuid.uuid4())
    response_text = ""
    user_turn = {"role": "user", "content": request.message}

    if is_direct_chat:
        # DIRECT_CHAT: direct LLM call, no ReAct loop (same as WebSocket path).
        # Prompt layout: [system prompt] + history + current user message.
        chat_messages = []
        if routing_result.system_prompt:
            chat_messages.append({"role": "system", "content": routing_result.system_prompt})
        chat_messages.extend(await _build_history_messages(conv.id))
        chat_messages.append(user_turn)
        response = await llm_gateway.chat(
            messages=chat_messages,
            model=routing_result.model or "default",
            agent_name="default",
            task_type="chat",
        )
        response_text = _ensure_non_empty(response.content)
    else:
        # REACT / SKILL_REACT / REWOO / REFLEXION / PLAN_EXEC / TEAM_COLLAB
        # Advanced modes (REWOO, REFLEXION, PLAN_EXEC, TEAM_COLLAB) currently
        # fall back to REACT with a warning. Full integration is tracked separately.
        if routing_result is not None and routing_result.execution_mode not in (
            ExecutionMode.REACT,
            ExecutionMode.SKILL_REACT,
        ):
            logger.warning(
                f"Execution mode {routing_result.execution_mode.value} not yet supported "
                f"in portal REST, falling back to REACT"
            )

        react_config = agent.get_react_config()
        # Reuse the agent's own engine when it has one; otherwise build a
        # throwaway engine for this request.
        react_engine = getattr(agent, "_react_engine", None)
        if react_engine is None:
            react_engine = ReActEngine(
                llm_gateway=llm_gateway,
                max_steps=react_config["max_steps"],
            )
        else:
            react_engine.reset()

        # History first, current user message last.
        messages = [*await _build_history_messages(conv.id), user_turn]
        tools = agent.get_tools()
        model = agent.get_model()
        system_prompt = getattr(agent, "_system_prompt", None) or agent.get_system_prompt()
        timeout_seconds = react_config["timeout_seconds"]

        collected_output: list[str] = []
        try:
            async for event in react_engine.execute_stream(
                messages=messages,
                tools=tools,
                model=model,
                agent_name=agent.name,
                system_prompt=system_prompt,
                timeout_seconds=timeout_seconds,
            ):
                # Only final answers contribute to the REST reply.
                if event.event_type == "final_answer":
                    collected_output.append(event.data.get("output", ""))
        except asyncio.CancelledError:
            raise
        except Exception as e:
            # Boundary handler: the client gets the error as the reply text,
            # and the traceback is kept in the server log.
            logger.exception("Portal chat ReAct execution failed (conversation=%s)", conv.id)
            response_text = f"执行出错: {e}"
        else:
            response_text = _ensure_non_empty(
                "".join(collected_output) if collected_output else None
            )

    await _conversation_store.add_message(conv.id, "assistant", response_text)

    return ChatResponse(
        conversation_id=conv.id,
        message=response_text,
        timestamp=datetime.now(timezone.utc).isoformat(),
        matched_skill=matched_skill,
        routing_method=routing_method,
        confidence=confidence,
        task_id=task_id,
        status="completed",
    )


@router.post("/portal/chat/stream")
async def chat_stream(request: ChatRequest, req: Request, _auth: None = Depends(_verify_api_key)):
    """Stream chat responses via SSE with RequestPreprocessor routing.

    The first event is always ``routing``. DIRECT_CHAT then yields a single
    ``final_answer`` event; every other mode forwards each ReAct engine event
    under its own event type, or one ``error`` event if the engine fails.
    Conversation history is injected in both modes, and the reply is
    persisted once the run finishes successfully.

    Raises:
        HTTPException: 503 when a ReAct-style mode is selected but no agent
            could be resolved.
    """
    from sse_starlette.sse import EventSourceResponse

    agent, routing_result, matched_skill, routing_method, confidence = await _resolve_for_chat(
        request, req
    )

    is_direct_chat = (
        routing_result is not None and routing_result.execution_mode == ExecutionMode.DIRECT_CHAT
    )
    # _resolve_for_chat may return no agent. Reject up front rather than
    # crashing mid-stream on agent.get_react_config().
    if not is_direct_chat and agent is None:
        raise HTTPException(
            status_code=503, detail="No agent is available to handle this request"
        )

    # Create or reuse conversation
    conv = await _conversation_store.get_or_create(request.conversation_id)
    await _conversation_store.add_message(conv.id, "user", request.message)

    llm_gateway = req.app.state.llm_gateway
    user_turn = {"role": "user", "content": request.message}

    async def event_generator():
        # Send routing info as first event
        yield {
            "event": "routing",
            "data": json.dumps(
                {
                    "skill": matched_skill,
                    "method": routing_method,
                    "confidence": confidence,
                }
            ),
        }

        if is_direct_chat:
            # DIRECT_CHAT: direct LLM call, no ReAct loop.
            # Prompt layout: [system prompt] + history + current user message.
            chat_messages = []
            if routing_result.system_prompt:
                chat_messages.append({"role": "system", "content": routing_result.system_prompt})
            chat_messages.extend(await _build_history_messages(conv.id))
            chat_messages.append(user_turn)
            response = await llm_gateway.chat(
                messages=chat_messages,
                model=routing_result.model or "default",
                agent_name="default",
                task_type="chat",
            )
            response_text = _ensure_non_empty(response.content)
            await _conversation_store.add_message(conv.id, "assistant", response_text)
            yield {
                "event": "final_answer",
                "data": json.dumps(
                    {
                        "step": 0,
                        "data": {"output": response_text},
                        "timestamp": datetime.now(timezone.utc).isoformat(),
                    }
                ),
            }
            return

        # REACT / SKILL_REACT / REWOO / REFLEXION / PLAN_EXEC / TEAM_COLLAB
        # Advanced modes fall back to REACT with a warning.
        if routing_result is not None and routing_result.execution_mode not in (
            ExecutionMode.REACT,
            ExecutionMode.SKILL_REACT,
        ):
            logger.warning(
                f"Execution mode {routing_result.execution_mode.value} not yet supported "
                f"in portal SSE, falling back to REACT"
            )

        react_config = agent.get_react_config()
        # Reuse the agent's own engine when it has one; otherwise build a
        # throwaway engine for this request.
        react_engine = getattr(agent, "_react_engine", None)
        if react_engine is None:
            react_engine = ReActEngine(
                llm_gateway=llm_gateway,
                max_steps=react_config["max_steps"],
            )
        else:
            react_engine.reset()

        # Inject conversation history exactly like the REST handler does, so
        # multi-turn context is not lost on the streaming path.
        messages = [*await _build_history_messages(conv.id), user_turn]
        tools = agent.get_tools()
        model = agent.get_model()
        system_prompt = getattr(agent, "_system_prompt", None) or agent.get_system_prompt()
        timeout_seconds = react_config["timeout_seconds"]

        collected_output: list[str] = []
        try:
            async for event in react_engine.execute_stream(
                messages=messages,
                tools=tools,
                model=model,
                agent_name=agent.name,
                system_prompt=system_prompt,
                timeout_seconds=timeout_seconds,
            ):
                # Final answers are also accumulated for persistence below.
                if event.event_type == "final_answer":
                    collected_output.append(event.data.get("output", ""))
                yield {
                    "event": event.event_type,
                    "data": json.dumps(
                        {
                            "step": event.step,
                            "data": event.data,
                            "timestamp": event.timestamp,
                        }
                    ),
                }
        except asyncio.CancelledError:
            raise
        except Exception as e:
            # Boundary handler: report the failure to the client as an SSE
            # error event and keep the traceback in the server log. Nothing
            # is persisted for a failed run.
            logger.exception("Portal SSE ReAct execution failed (conversation=%s)", conv.id)
            yield {
                "event": "error",
                "data": json.dumps({"error": str(e)}),
            }
            return

        response_text = _ensure_non_empty(
            "".join(collected_output) if collected_output else None
        )
        await _conversation_store.add_message(conv.id, "assistant", response_text)

    return EventSourceResponse(event_generator())


@router.get("/portal/capabilities", response_model=CapabilitiesResponse)
async def get_capabilities(req: Request, _auth: None = Depends(_verify_api_key)):
    """Return every capability category together with its skill count.

    Each registered skill adds one to the count of every capability tag it
    declares, plus one to the "skills" category. Categories come from
    CAPABILITY_CATEGORIES and are always reported as enabled.
    """
    registered_skills = req.app.state.skill_registry.list_skills()

    # capability tag -> number of skills carrying it
    tally: dict[str, int] = {}
    for skill in registered_skills:
        for tag in (cap.tag for cap in skill.capabilities):
            tally[tag] = tally.get(tag, 0) + 1
        # Every skill also counts toward the generic "skills" category.
        tally["skills"] = tally.get("skills", 0) + 1

    return CapabilitiesResponse(
        capabilities=[
            CapabilityInfo(
                name=category,
                display_name=meta["display_name"],
                description=meta["description"],
                icon=meta["icon"],
                enabled=True,
                skill_count=tally.get(category, 0),
            )
            for category, meta in CAPABILITY_CATEGORIES.items()
        ]
    )


@router.get("/portal/conversations")
async def list_conversations(limit: int = 20, _auth: None = Depends(_verify_api_key)):
    """Return summaries of the most recent conversations.

    The title of each conversation is derived from its first persisted user
    message rather than from the conversation object's ``messages`` list, so
    a conversation whose list is empty still gets a real title instead of
    the "对话" placeholder.

    Each summary also carries ``is_board``, which lets the sidebar show a
    "私董会" badge without loading the conversation's full history.
    """

    async def _summarize(conv) -> dict:
        # Title comes from storage, not from conv.messages (see docstring).
        first_user = await _conversation_store.get_first_user_message(conv.id)
        title = _derive_conversation_title_from_content(
            first_user.content if first_user else None
        )
        is_board = await _conversation_has_board_started(conv.id)
        return {
            "id": conv.id,
            "title": title,
            "created_at": conv.created_at.isoformat(),
            "updated_at": conv.updated_at.isoformat(),
            "message_count": len(conv.messages),
            "is_board": is_board,
        }

    recent = await _conversation_store.list_conversations(limit=limit)
    # Summaries are built one at a time, in the order the store returned them.
    return [await _summarize(conv) for conv in recent]


async def _conversation_has_board_started(conversation_id: str) -> bool:
    """Tell whether the conversation has a persisted ``board_started`` message.

    The sidebar list uses this to decide on the "私董会" badge without
    loading the full history. The answer comes from the conversation store's
    ``has_message_with_type`` lookup.

    Storage errors are logged at warning level and reported as ``False`` so
    the badge can never break the list endpoint.
    """
    try:
        found = await _conversation_store.has_message_with_type(
            conversation_id, "board_started"
        )
    except (ConnectionError, OSError, asyncio.TimeoutError, ValueError, KeyError, RuntimeError):
        logger.warning("is_board lookup failed for %s", conversation_id, exc_info=True)
        return False
    return found


def _derive_conversation_title(conv: Conversation) -> str:
    """Derive a human-readable title from a conversation object.

    The first user message with non-empty content is used. Formatting is
    delegated to ``_derive_conversation_title_from_content`` so both title
    helpers agree: a leading ``@board``/``@team`` command is stripped, the
    result is cut to 20 characters plus "..." when longer, and "对话" is
    returned when there is no usable text.
    """
    first_user_content = next(
        (msg.content for msg in conv.messages if msg.role == "user" and msg.content),
        None,
    )
    return _derive_conversation_title_from_content(first_user_content)


# Leading "@board" / "@team" command, optionally followed by ":<experts>" and
# "rounds=<n>". The negative lookahead stops the keyword from matching as a
# mere prefix of a longer ASCII word ("@boardroom", "@teamwork") while still
# accepting text that follows the keyword without a space (e.g. CJK topics).
_COMMAND_PREFIX_RE = re.compile(
    r"^@(?:board|team)(?![A-Za-z0-9_])(?::[^\s]+)?(?:\s+rounds=\d+)?\s*",
    re.IGNORECASE,
)


def _strip_command_prefix(content: str) -> str:
    """Strip a leading @board/@team command so conversation titles show only the topic.

    Text that merely starts with the same letters (e.g. "@boardroom") is not
    a command and is returned unchanged apart from whitespace trimming.

    Examples:
        "@board:warren,charlie 怎么看 AI" → "怎么看 AI"
        "@team 私董会" → "私董会"
        "@board rounds=3 软件行业" → "软件行业"
        "@boardroom meeting" → "@boardroom meeting"

    Args:
        content: Raw message text; empty or ``None`` yields "".

    Returns:
        The text with the command prefix removed and surrounding whitespace
        stripped.
    """
    if not content:
        return ""
    return _COMMAND_PREFIX_RE.sub("", content, count=1).strip()


def _derive_conversation_title_from_content(content: str | None) -> str:
    """Derive title from a string content (used when conv.messages is empty)."""
    if content:
        cleaned = _strip_command_prefix(content)
        if cleaned:
            return cleaned[:20] + ("..." if len(cleaned) > 20 else "")
    return "对话"


@router.get("/portal/conversations/{conversation_id}")
async def get_conversation(
    conversation_id: str, limit: int = 50, _auth: None = Depends(_verify_api_key)
):
    """Return one conversation's history from the SQLite-backed store.

    Responds with 404 when the store returns no history for the id.

    The title comes from the first user message found in `history` (the
    authoritative source), not from the in-memory cache, whose `messages`
    list may be empty after a server restart. This avoids the regression
    where selecting a conversation collapses the title to "对话".

    `is_board` is true when any returned message carries
    `message_type == "board_started"` in its metadata.
    """
    history = await _conversation_store.get_history(conversation_id, limit=limit)
    if not history:
        raise HTTPException(status_code=404, detail=f"Conversation '{conversation_id}' not found")

    conv = await _conversation_store.get_or_create(conversation_id)

    # First non-empty user message, or None if the history has none.
    title_source = None
    for entry in history:
        if entry.role == "user" and entry.content:
            title_source = entry.content
            break

    # Built field by field, in the same order as the response keys.
    response = {"id": conv.id}
    response["title"] = _derive_conversation_title_from_content(title_source)
    response["messages"] = [
        _hydrate_persisted_message(conv.id, position, entry)
        for position, entry in enumerate(history)
    ]
    response["created_at"] = conv.created_at.isoformat()
    response["updated_at"] = conv.updated_at.isoformat()

    board_seen = False
    for entry in history:
        if (entry.metadata or {}).get("message_type") == "board_started":
            board_seen = True
            break
    response["is_board"] = board_seen
    return response


# Metadata keys that are copied up to the top level of a message payload so
# board_* messages can be reconstructed after a page reload. The Message
# dataclass only has role/content/timestamp as first-class columns, so
# everything else (message_type, expert identity, board round/role,
# conclusion payload) travels inside Message.metadata.
_PERSISTED_MESSAGE_FIELDS = (
    "message_type",
    "expert_id",
    "expert_name",
    "expert_avatar",
    "expert_color",
    "board_round",
    "board_role",
    "board_conclusion",
    "board_started",
    "matched_skill",
    "confidence",
    "routing_method",
    "thinking",
    "tool_calls",
)


def _hydrate_persisted_message(conv_id: str, index: int, msg) -> dict:
    """Convert one persisted message into its API response dict.

    The base payload carries ``id`` (``"<conv_id>-<index>"``), ``role``,
    ``content``, an ISO-formatted ``timestamp`` and the raw ``metadata``
    (``{}`` when the message has none). When the metadata is a dict, every
    key listed in ``_PERSISTED_MESSAGE_FIELDS`` that holds a non-None value
    is also copied to the top level, so the frontend can render
    board_speech / board_summary / board_conclusion cards after a reload
    instead of showing every restored assistant message as a plain bubble.
    """
    payload: dict = {
        "id": f"{conv_id}-{index}",
        "role": msg.role,
        "content": msg.content,
        "timestamp": msg.timestamp.isoformat(),
    }
    stored_meta = getattr(msg, "metadata", None) or {}
    payload["metadata"] = stored_meta

    # Only dict metadata can be promoted; anything else is passed through as-is.
    if isinstance(stored_meta, dict):
        payload.update(
            {
                field_name: stored_meta[field_name]
                for field_name in _PERSISTED_MESSAGE_FIELDS
                if stored_meta.get(field_name) is not None
            }
        )
    return payload


@router.delete("/portal/conversations/{conversation_id}")
async def delete_conversation(conversation_id: str, _auth: None = Depends(_verify_api_key)):
    """Remove a conversation together with all of its messages.

    Responds with 404 when the store reports that nothing was deleted.

    ponytail: IDOR note — portal endpoints authenticate with an API key
    (single-tenant access model: API key = full access to all
    conversations). The SQLite store has no user_id column, so per-user
    ownership cannot be enforced without a schema migration. If API keys
    become per-user, add a user_id column to conversations and filter
    DELETE by (id, user_id).
    Upgrade path: migrate portal endpoints to JWT auth + per-user scoping.
    """
    was_removed = await _conversation_store.delete_conversation(conversation_id)
    if was_removed:
        return {"deleted": True, "id": conversation_id}
    raise HTTPException(status_code=404, detail=f"Conversation '{conversation_id}' not found")


def _derive_title_from_messages(messages: list) -> str:
    """Build a short title from Message objects (SessionManager format).

    Uses the first message whose ``role.value`` is ``"user"`` and whose
    content is non-empty: its first 20 characters, plus ``"..."`` when the
    content is longer. Returns "对话" when no such message exists.
    """
    first_user_text = next(
        (entry.content for entry in messages if entry.role.value == "user" and entry.content),
        None,
    )
    if first_user_text is None:
        return "对话"
    suffix = "..." if len(first_user_text) > 20 else ""
    return first_user_text[:20] + suffix


async def _execute_react_background(
    react_engine: ReActEngine,
    messages: list[dict],
    tools: list,
    model: str,
    agent_name: str,
    system_prompt: str | None,
    timeout_seconds: float | None,
    conv_id: str,
    task_id: str,
    event_queue: EventQueue,
    conversation_store: SqliteConversationStore,
    task_store: InMemoryTaskStore | None = None,
) -> None:
    """Execute ReAct engine in the background, decoupled from WebSocket lifecycle.

    Events are emitted to the EventQueue (filtered by task_id) so that any
    subscriber — including a reconnected WebSocket — can consume them.
    Results are always persisted to the conversation store, regardless of
    whether a WebSocket subscriber is active.
    Task status is tracked in TaskStore when provided.

    Args:
        react_engine: Engine whose ``execute_stream`` yields the events.
        messages: Chat messages forwarded to the engine.
        tools: Tools forwarded to the engine.
        model: Model name forwarded to the engine.
        agent_name: Agent name forwarded to the engine.
        system_prompt: Optional system prompt forwarded to the engine.
        timeout_seconds: Optional timeout forwarded to the engine.
        conv_id: Conversation the assistant reply is stored under; also used
            as the ``session_id`` of emitted events.
        task_id: Identifier attached to every emitted event and TaskStore update.
        event_queue: Queue that receives step and terminal events.
        conversation_store: Store that receives the assistant message.
        task_store: Optional store updated with RUNNING / COMPLETED / FAILED.

    Raises:
        asyncio.CancelledError: Re-raised after best-effort cleanup (persist
            partial output, mark the task FAILED, emit TASK_FAILED).

    Any other exception from the streaming / completion path is handled
    here: the task is marked FAILED and TASK_FAILED is emitted.
    """
    # Function-scope import: sqlite3.Error derives directly from Exception, so
    # a failing SQLite write during cleanup would otherwise escape the
    # best-effort handlers below and skip the FAILED update / TASK_FAILED emit.
    import sqlite3

    task_store_errors = (
        ConnectionError,
        OSError,
        asyncio.TimeoutError,
        ValueError,
        KeyError,
        RuntimeError,
    )
    persist_errors = (*task_store_errors, sqlite3.Error)

    collected_output: list[str] = []
    # Set once the final answer has been written to the conversation store, so
    # a cancel/error arriving afterwards does not store the same text twice.
    final_persisted = False
    try:
        if task_store is not None:
            try:
                await _task_store_update_status(
                    task_store, task_id, TaskStatus.RUNNING, started_at=datetime.now(timezone.utc)
                )
            except task_store_errors:
                logger.warning("Failed to update TaskStore RUNNING", exc_info=True)

        async for event in react_engine.execute_stream(
            messages=messages,
            tools=tools,
            model=model,
            agent_name=agent_name,
            system_prompt=system_prompt,
            timeout_seconds=timeout_seconds,
        ):
            if event.event_type == "final_answer":
                # Coerce to str: a None / non-string "output" would make the
                # "".join calls below (including those in the error handlers)
                # raise TypeError and abort cleanup.
                output_chunk = event.data.get("output")
                collected_output.append("" if output_chunk is None else str(output_chunk))

            # P1 #8/#9/#10 fix: Preserve TurnEventType mapping, step field,
            # and original data structure for EQ subscriber compatibility.
            # Note: Event dataclass has no 'step' field; use getattr for
            # compatibility with ReActEngine events that may include it.
            _turn_event_type = _REACT_EVENT_TYPE_MAP.get(event.event_type)
            if _turn_event_type is not None:
                await _emit_event_safe(
                    event_queue,
                    _turn_event_type,
                    task_id=task_id,
                    session_id=conv_id,
                    data={
                        **event.data,
                        "step": getattr(event, "step", 0),
                        "timestamp": event.timestamp,
                    },
                )

        # Normal completion: persist result
        response_text = _ensure_non_empty("".join(collected_output) if collected_output else None)
        await conversation_store.add_message(conv_id, "assistant", response_text)
        final_persisted = True

        if task_store is not None:
            try:
                await _task_store_update_status(
                    task_store,
                    task_id,
                    TaskStatus.COMPLETED,
                    output_data={"output": response_text},
                    completed_at=datetime.now(timezone.utc),
                    progress=1.0,
                    progress_message="Completed",
                )
            except task_store_errors:
                logger.warning("Failed to update TaskStore COMPLETED", exc_info=True)

        # Emit task.completed so subscribers know the task is done
        await _emit_event_safe(
            event_queue,
            TaskEventType.TASK_COMPLETED,
            task_id=task_id,
            session_id=conv_id,
            data={"output": response_text, "timestamp": datetime.now(timezone.utc).isoformat()},
        )

    except asyncio.CancelledError:
        # Application shutdown or explicit cancel — persist partial output
        # and mark task as FAILED so resume does not block forever.
        # P0 #1/#2 fix: ALL persistence operations must use asyncio.shield
        # and the async TaskStore shim. Without shield, a re-entrant
        # cancellation kills the cleanup itself; without the shim,
        # RedisTaskStore (async) silently drops the coroutine.
        if collected_output and not final_persisted:
            partial = _ensure_non_empty("".join(collected_output))
            try:
                await asyncio.shield(conversation_store.add_message(conv_id, "assistant", partial))
            except (asyncio.CancelledError, *persist_errors):
                logger.warning("Failed to persist partial output on cancel", exc_info=True)
        if task_store is not None:
            try:
                await asyncio.shield(
                    _task_store_update_status(
                        task_store,
                        task_id,
                        TaskStatus.FAILED,
                        error_message="Task cancelled",
                        completed_at=datetime.now(timezone.utc),
                    )
                )
            except (asyncio.CancelledError, *task_store_errors):
                logger.warning("Failed to update TaskStore on cancel", exc_info=True)
        # P0 #2 fix: _emit_event_safe is async (it awaits event_queue.emit).
        # Shield it so a re-entrant CancelledError doesn't kill the emit
        # and leave subscribers blocked until timeout.
        try:
            await asyncio.shield(
                _emit_event_safe(
                    event_queue,
                    TaskEventType.TASK_FAILED,
                    task_id=task_id,
                    session_id=conv_id,
                    data={
                        "error": "Task cancelled",
                        "timestamp": datetime.now(timezone.utc).isoformat(),
                    },
                )
            )
        except (asyncio.CancelledError, asyncio.QueueFull, RuntimeError, ConnectionError):
            logger.warning("Failed to emit TASK_FAILED on cancel", exc_info=True)
        raise  # Propagate cancellation

    except Exception as e:
        # Persist any partial output collected before the error (skipped when
        # the final answer already reached the conversation store).
        if collected_output and not final_persisted:
            partial = _ensure_non_empty("".join(collected_output))
            try:
                await conversation_store.add_message(conv_id, "assistant", partial)
            except persist_errors:
                logger.warning(
                    "Failed to persist partial output in background task", exc_info=True
                )

        if task_store is not None:
            try:
                await _task_store_update_status(
                    task_store,
                    task_id,
                    TaskStatus.FAILED,
                    error_message=str(e),
                    completed_at=datetime.now(timezone.utc),
                )
            except task_store_errors:
                logger.warning("Failed to update TaskStore FAILED", exc_info=True)

        # Emit task.failed so subscribers know the task failed
        await _emit_event_safe(
            event_queue,
            TaskEventType.TASK_FAILED,
            task_id=task_id,
            session_id=conv_id,
            data={"error": str(e), "timestamp": datetime.now(timezone.utc).isoformat()},
        )


@router.websocket("/portal/ws")
async def portal_websocket(websocket: WebSocket):
    """Real-time chat WebSocket endpoint."""
    await websocket.accept()

    # ponytail: ws_user_id must be initialized before any early return — the
    # finally block below references it. Previously the api_key reject path
    # returned before assignment, causing UnboundLocalError that masked the
    # original auth error. Upgrade path: refactor auth into a decorator.
    ws_user_id: str | None = None

    # Authentication (after accept, since FastAPI requires accept before close)
    configured_api_key: str | None = None
    if hasattr(websocket.app.state, "server_config") and websocket.app.state.server_config:
        configured_api_key = websocket.app.state.server_config.api_key
    if configured_api_key is None and hasattr(websocket.app.state, "api_key"):
        configured_api_key = websocket.app.state.api_key

    # Check api_key query param
    if configured_api_key:
        provided = websocket.query_params.get("api_key")
        if not hmac.compare_digest((provided or "").encode(), configured_api_key.encode()):
            await websocket.send_json(
                {"type": "error", "data": {"message": "Invalid or missing api_key"}}
            )
            await websocket.close(code=4001, reason="Invalid or missing api_key")
            return

    # Track authenticated portal connections for user-scoped push (calendar
    # reminders, etc.). user_id is None for API-key / dev-mode clients.
    current_user = getattr(websocket.state, "current_user", None) or {}
    ws_user_id = current_user.get("user_id")
    if ws_user_id:
        portal_connection_manager.add(ws_user_id, websocket)

    # Wait for first chat message before creating conversation
    conv: Conversation | None = None
    # task_id is per-user-message; tracked here so the outer except can emit task.failed
    task_id: str | None = None
    # Track the active background task so cancel can propagate to it.
    active_bg_task: asyncio.Task | None = None

    try:
        while True:
            try:
                timeout = _WS_HEARTBEAT_TIMEOUT if _WS_HEARTBEAT_TIMEOUT > 0 else None
                raw = await asyncio.wait_for(websocket.receive_text(), timeout=timeout)
            except asyncio.TimeoutError:
                await websocket.close(code=1000, reason="Heartbeat timeout")
                return

            try:
                msg = json.loads(raw)
            except json.JSONDecodeError:
                continue

            msg_type = msg.get("type")

            if msg_type == "cancel":
                # Cancel the active background task if still running
                if active_bg_task is not None and not active_bg_task.done():
                    active_bg_task.cancel()
                    active_bg_task = None
                await websocket.send_json(
                    {
                        "type": "result",
                        "data": {
                            "status": "cancelled",
                            "timestamp": datetime.now(timezone.utc).isoformat(),
                        },
                    }
                )
                return

            if msg_type == "ping":
                await websocket.send_json({"type": "pong"})
                continue

            if msg_type == "resume":
                # Frontend reconnected and wants to resume a running task
                resume_task_id = msg.get("task_id", "")
                if not resume_task_id:
                    continue

                # P1 #3/#4 fix: Fail-closed ownership verification.
                # Require conversation_id and TaskStore — reject if either
                # is missing, to prevent cross-conversation task hijacking
                # via empty conversation_id or unconfigured TaskStore.
                resume_conv_id = msg.get("conversation_id", "")
                if not resume_conv_id:
                    await websocket.send_json(
                        {
                            "type": "error",
                            "data": {
                                "message": "Resume requires conversation_id.",
                                "task_id": resume_task_id,
                            },
                        }
                    )
                    continue

                resume_task_store: InMemoryTaskStore | None = getattr(
                    websocket.app.state, "task_store", None
                )
                resume_eq: EventQueue | None = getattr(websocket.app.state, "event_queue", None)

                # P1 #4: Fail-closed if TaskStore is unavailable — cannot
                # verify ownership without it.
                if resume_task_store is None:
                    await websocket.send_json(
                        {
                            "type": "error",
                            "data": {
                                "message": "Resume not supported (TaskStore unavailable). Please retry your request.",
                                "task_id": resume_task_id,
                            },
                        }
                    )
                    continue

                try:
                    record = await _task_store_get(resume_task_store, resume_task_id)
                except (
                    ConnectionError,
                    OSError,
                    asyncio.TimeoutError,
                    ValueError,
                    KeyError,
                    RuntimeError,
                ):
                    logger.warning("TaskStore.get failed during resume", exc_info=True)
                    record = None
                if record is not None:
                    # P1 #3: Fail-closed ownership check — reject if
                    # conversation_id is missing from task metadata OR
                    # does not match the request.
                    task_conv_id = (record.metadata or {}).get("conversation_id", "")
                    if not task_conv_id or resume_conv_id != task_conv_id:
                        logger.warning(
                            "Resume rejected: conversation_id mismatch "
                            "(task=%s, request=%s, task_id=%s)",
                            task_conv_id,
                            resume_conv_id,
                            resume_task_id,
                        )
                        await websocket.send_json(
                            {
                                "type": "error",
                                "data": {
                                    "message": "Task does not belong to this conversation.",
                                    "task_id": resume_task_id,
                                },
                            }
                        )
                        continue
                    if record.status == TaskStatus.COMPLETED:
                        # Task already finished — send result immediately
                        output = (record.output_data or {}).get("output", "")
                        await websocket.send_json(
                            {
                                "type": "result",
                                "data": {
                                    "message": output,
                                    "timestamp": record.completed_at.isoformat()
                                    if record.completed_at
                                    else datetime.now(timezone.utc).isoformat(),
                                },
                            }
                        )
                        continue
                    elif record.status == TaskStatus.FAILED:
                        await websocket.send_json(
                            {
                                "type": "error",
                                "data": {
                                    "message": record.error_message or "Task failed",
                                },
                            }
                        )
                        continue
                else:
                    # Task not found in store — cannot resume
                    await websocket.send_json(
                        {
                            "type": "error",
                            "data": {
                                "message": "Task not found or has expired. Please retry your request.",
                                "task_id": resume_task_id,
                            },
                        }
                    )
                    continue

                # Task is still running — subscribe to EventQueue for remaining events.
                # H6: if EventQueue is unavailable, inform the client instead of
                # silently continuing (which would leave the UI loading forever).
                if resume_eq is None:
                    await websocket.send_json(
                        {
                            "type": "error",
                            "data": {
                                "message": "Resume not supported (EventQueue unavailable). Please retry your request.",
                            },
                        }
                    )
                    continue

                # C2: bound the subscribe loop with a timeout so a dead
                # background task cannot block resume forever.
                resume_timeout = _WS_HEARTBEAT_TIMEOUT * 10 if _WS_HEARTBEAT_TIMEOUT > 0 else 600
                try:
                    async with asyncio.timeout(resume_timeout):
                        async for event in resume_eq.subscribe(task_id=resume_task_id):
                            if event.event_type == TaskEventType.TASK_COMPLETED:
                                response_text = event.data.get("output", EMPTY_LLM_RESPONSE)
                                await websocket.send_json(
                                    {
                                        "type": "result",
                                        "data": {
                                            "message": response_text,
                                            "timestamp": event.data.get(
                                                "timestamp",
                                                datetime.now(timezone.utc).isoformat(),
                                            ),
                                        },
                                    }
                                )
                                break
                            elif event.event_type == TaskEventType.TASK_FAILED:
                                await websocket.send_json(
                                    {
                                        "type": "error",
                                        "data": {
                                            "message": event.data.get("error", "Unknown error"),
                                        },
                                    }
                                )
                                break
                            else:
                                # P1 #8/#10 fix: step and data are now
                                # top-level fields in event.data.
                                await websocket.send_json(
                                    {
                                        "type": "step",
                                        "data": {
                                            "event_type": event.event_type,
                                            "step": event.data.get("step", 0),
                                            "data": {
                                                k: v
                                                for k, v in event.data.items()
                                                if k not in ("step", "timestamp")
                                            },
                                            "timestamp": event.data.get("timestamp", ""),
                                        },
                                    }
                                )
                except TimeoutError:
                    logger.warning(f"Resume subscribe timed out for task {resume_task_id}")
                    await websocket.send_json(
                        {
                            "type": "error",
                            "data": {
                                "message": "Task resume timed out. Please retry your request.",
                                "task_id": resume_task_id,
                            },
                        }
                    )
                except RuntimeError as exc:
                    # P1 #5: subscriber limit reached or EQ closed — send
                    # a friendly error instead of terminating the connection.
                    logger.warning("Resume subscribe failed for task %s: %s", resume_task_id, exc)
                    await websocket.send_json(
                        {
                            "type": "error",
                            "data": {
                                "message": "Server busy, please retry shortly.",
                                "task_id": resume_task_id,
                            },
                        }
                    )
                continue

            if msg_type != "chat":
                continue

            message_text = msg.get("message", "")
            model_override = msg.get("model")  # Frontend model selector

            if not message_text:
                continue

            # Create or switch conversation based on conversation_id from frontend
            conv_id = msg.get("conversation_id")
            if conv_id:
                if conv is None or conv.id != conv_id:
                    conv = await _conversation_store.get_or_create(conv_id)
                    await websocket.send_json({"type": "connected", "conversation_id": conv.id})
            elif conv is None:
                conv = await _conversation_store.get_or_create(conv_id)
                await websocket.send_json({"type": "connected", "conversation_id": conv.id})

            # @board / @team intercept — mirror chat.py:1076-1081.
            # Frontend WS connects to /api/v1/portal/ws (this router), not
            # /api/v1/chat/ws/{session_id} (chat.py). Without this intercept
            # @board/@team messages are treated as plain text and no
            # board_started/team_formed events are broadcast, so the cards
            # never render. Placed before task_id emit so intercepted
            # messages don't leave orphan tasks in the EQ side-channel.
            if await _execute_board_meeting(websocket, conv.id, message_text, _sm_adapter):
                continue
            if await _execute_team_collab(websocket, conv.id, message_text, _sm_adapter):
                continue

            # Generate task_id for this user message and emit task.created to EQ
            # (EQ is a side-channel: emit failures never break the WebSocket flow)
            task_id = str(uuid.uuid4())
            event_queue: EventQueue | None = getattr(websocket.app.state, "event_queue", None)
            task_store: InMemoryTaskStore | None = getattr(websocket.app.state, "task_store", None)
            await _emit_event_safe(
                event_queue,
                TaskEventType.TASK_CREATED,
                task_id=task_id,
                session_id=conv.id,
                data={"message": message_text},
            )

            # Add user message to conversation
            await _conversation_store.add_message(conv.id, "user", message_text)
            start_time = datetime.now(timezone.utc)

            async def _record_experience(
                task_type: str, goal: str, outcome: str, duration_seconds: float
            ) -> None:
                """Record experience to dashboard after chat completion."""
                try:
                    exp = DashboardExperience(
                        id=str(uuid.uuid4()),
                        task_type=task_type,
                        goal=goal[:200],
                        outcome=outcome,
                        duration_seconds=duration_seconds,
                        created_at=datetime.now(timezone.utc),
                    )
                    _dashboard_experiences.append(exp)
                    await _broadcast_dashboard_event(
                        "experience_added",
                        {
                            "id": exp.id,
                            "task_type": exp.task_type,
                            "goal": exp.goal,
                            "outcome": exp.outcome,
                        },
                    )
                    await _broadcast_dashboard_event("metrics_updated", {"period": "7d"})
                except (
                    asyncio.QueueFull,
                    RuntimeError,
                    ConnectionError,
                    ValueError,
                    KeyError,
                ) as e:
                    logger.warning(f"Failed to record experience: {e}")

            # Unified preprocessing via RequestPreprocessor (minimal: @skill prefix + greeting regex + REACT)
            pool = websocket.app.state.agent_pool
            skill_registry = websocket.app.state.skill_registry
            llm_gateway = websocket.app.state.llm_gateway
            request_preprocessor: RequestPreprocessor = websocket.app.state.request_preprocessor

            all_skills = skill_registry.list_skills()

            # Get default tools for RequestPreprocessor
            default_tools = []
            default_system_prompt = None
            default_agent = pool.get_agent("default")
            if default_agent is not None:
                default_tools = default_agent.get_tools()
                default_system_prompt = (
                    getattr(default_agent, "_system_prompt", None)
                    or default_agent.get_system_prompt()
                )
            else:
                for skill in all_skills:
                    agent = pool.get_agent(skill.name)
                    if agent is not None:
                        default_tools = agent.get_tools()
                        default_system_prompt = (
                            getattr(agent, "_system_prompt", None) or agent.get_system_prompt()
                        )
                        break

            # Preprocess via RequestPreprocessor (minimal: @skill prefix + greeting regex + REACT)
            routing_result = await request_preprocessor.preprocess(
                content=message_text,
                skill_registry=skill_registry,
                default_tools=default_tools,
                default_system_prompt=default_system_prompt,
                default_model=model_override or "default",
                default_agent_name="default",
            )

            await websocket.send_json(
                {
                    "type": "routing",
                    "skill": routing_result.agent_name or "default",
                    "method": routing_result.match_method or "intent",
                    "confidence": routing_result.match_confidence,
                }
            )

            # Emit task.started to EQ (execution begins after routing)
            await _emit_event_safe(
                event_queue,
                TaskEventType.TASK_STARTED,
                task_id=task_id,
                session_id=conv.id,
                data={
                    "agent_name": routing_result.agent_name or "default",
                    "execution_mode": routing_result.execution_mode.value
                    if hasattr(routing_result.execution_mode, "value")
                    else str(routing_result.execution_mode),
                },
            )

            # Register task in TaskStore for status tracking and recovery
            if task_store is not None:
                try:
                    await _task_store_create(
                        task_store,
                        task_id=task_id,
                        agent_name=routing_result.agent_name or "default",
                        input_data={"message": message_text},
                        skill_name=routing_result.skill_name,
                    )
                    # Store conversation_id in metadata for frontend recovery
                    await _task_store_update_status(
                        task_store,
                        task_id,
                        TaskStatus.PENDING,
                        metadata={"conversation_id": conv.id},
                    )
                except (
                    ConnectionError,
                    OSError,
                    asyncio.TimeoutError,
                    ValueError,
                    KeyError,
                    RuntimeError,
                ):
                    logger.warning("Failed to register task in TaskStore", exc_info=True)

            # Execute based on routing result's execution_mode
            # This is the single source of truth for path selection,
            # replacing fragile string-matching on match_method.
            if routing_result.execution_mode == ExecutionMode.DIRECT_CHAT:
                # Zero-cost path: direct LLM call, no ReAct loop
                chat_messages = []
                # Inject system prompt (contains SOUL/USER/MEMORY/DAILY) for identity continuity
                if routing_result.system_prompt:
                    chat_messages.append(
                        {"role": "system", "content": routing_result.system_prompt}
                    )
                chat_messages.append({"role": "user", "content": message_text})
                # Inject conversation history for context continuity
                history_msgs = await _build_history_messages(conv.id)
                for hm in history_msgs:
                    chat_messages.insert(-1, hm)
                response = await llm_gateway.chat(
                    messages=chat_messages,
                    model=model_override or "default",
                    agent_name="default",
                    task_type="chat",
                )
                # Store assistant reply for multi-turn context continuity
                response_content = _ensure_non_empty(response.content)
                await _conversation_store.add_message(conv.id, "assistant", response_content)

                # Update TaskStore status to COMPLETED
                if task_store is not None:
                    try:
                        await _task_store_update_status(
                            task_store,
                            task_id,
                            TaskStatus.COMPLETED,
                            output_data={"output": response_content},
                            completed_at=datetime.now(timezone.utc),
                            progress=1.0,
                            progress_message="Completed",
                        )
                    except (
                        ConnectionError,
                        OSError,
                        asyncio.TimeoutError,
                        ValueError,
                        KeyError,
                        RuntimeError,
                    ):
                        logger.warning("Failed to update TaskStore for DIRECT_CHAT", exc_info=True)

                # Emit turn.final_answer and task.completed to EQ
                await _emit_event_safe(
                    event_queue,
                    TurnEventType.FINAL_ANSWER,
                    task_id=task_id,
                    session_id=conv.id,
                    data={"output": response_content},
                )
                await _emit_event_safe(
                    event_queue,
                    TaskEventType.TASK_COMPLETED,
                    task_id=task_id,
                    session_id=conv.id,
                    data={"output": response_content},
                )

                await websocket.send_json(
                    {
                        "type": "result",
                        "data": {
                            "message": response_content,
                            "timestamp": datetime.now(timezone.utc).isoformat(),
                        },
                    }
                )
                await _record_experience(
                    "chat",
                    message_text,
                    "success",
                    (datetime.now(timezone.utc) - start_time).total_seconds(),
                )
                continue

            # REACT / SKILL_REACT / REWOO / REFLEXION / PLAN_EXEC / TEAM_COLLAB
            # Advanced modes fall back to REACT with a warning.
            if routing_result.execution_mode not in (
                ExecutionMode.REACT,
                ExecutionMode.SKILL_REACT,
            ):
                logger.warning(
                    f"Execution mode {routing_result.execution_mode.value} not yet supported "
                    f"in portal WebSocket, falling back to REACT"
                )

            agent_name = routing_result.agent_name or "default"
            agent = pool.get_agent(agent_name)
            if agent is None:
                # Agent not in pool — fall back to direct chat.
                # This handles the case where routing returned an agent_name
                # that doesn't exist in the pool (e.g. "default" or a
                # skill that hasn't been instantiated yet).
                logger.info(
                    f"Session {conv.id}: agent '{agent_name}' not in pool, falling back to direct chat"
                )
                chat_messages = []
                # Inject system prompt (contains SOUL/USER/MEMORY/DAILY) for identity continuity
                if routing_result.system_prompt:
                    chat_messages.append(
                        {"role": "system", "content": routing_result.system_prompt}
                    )
                chat_messages.append({"role": "user", "content": message_text})
                try:
                    history = await _conversation_store.get_history(conv.id, limit=20)
                    for hist_msg in history[:-1]:
                        if hist_msg.role in ("user", "assistant"):
                            chat_messages.insert(
                                -1, {"role": hist_msg.role, "content": hist_msg.content}
                            )
                except (
                    ConnectionError,
                    OSError,
                    asyncio.TimeoutError,
                    ValueError,
                    KeyError,
                    RuntimeError,
                ):
                    pass
                response = await llm_gateway.chat(
                    messages=chat_messages,
                    model=model_override or "default",
                    agent_name="default",
                    task_type="chat",
                )
                # Store assistant reply for multi-turn context continuity
                response_content = _ensure_non_empty(response.content)
                await _conversation_store.add_message(conv.id, "assistant", response_content)

                # Emit turn.final_answer and task.completed to EQ (fallback path)
                await _emit_event_safe(
                    event_queue,
                    TurnEventType.FINAL_ANSWER,
                    task_id=task_id,
                    session_id=conv.id,
                    data={"output": response_content},
                )
                await _emit_event_safe(
                    event_queue,
                    TaskEventType.TASK_COMPLETED,
                    task_id=task_id,
                    session_id=conv.id,
                    data={"output": response_content},
                )

                await websocket.send_json(
                    {
                        "type": "result",
                        "data": {
                            "status": "completed",
                            "content": response_content,
                            "timestamp": datetime.now(timezone.utc).isoformat(),
                        },
                    }
                )
                await _record_experience(
                    "chat",
                    message_text,
                    "success",
                    (datetime.now(timezone.utc) - start_time).total_seconds(),
                )
                continue

            # Execute via ReAct stream
            react_config = agent.get_react_config()
            # Reuse agent's ReActEngine if available (aligned with chat.py pattern)
            react_engine = getattr(agent, "_react_engine", None)
            if react_engine is None:
                react_engine = ReActEngine(
                    llm_gateway=llm_gateway,
                    max_steps=react_config["max_steps"],
                )
            else:
                react_engine.reset()

            messages = [{"role": "user", "content": message_text}]
            # Inject conversation history for context continuity
            history_msgs = await _build_history_messages(conv.id)
            for hm in reversed(history_msgs):
                messages.insert(0, hm)
            tools = agent.get_tools()
            model = model_override or agent.get_model()
            system_prompt = getattr(agent, "_system_prompt", None) or agent.get_system_prompt()
            timeout_seconds = react_config["timeout_seconds"]
            logger.info(
                f"[portal] agent='{agent_name}' tools={len(tools)} "
                f"[{', '.join(t.name for t in tools)}] model={model}"
            )

            # Start ReAct execution as a background task, decoupled from
            # WebSocket lifecycle.  When the WebSocket disconnects, the
            # background task continues running and persists the result.
            bg_task = asyncio.create_task(
                _execute_react_background(
                    react_engine=react_engine,
                    messages=messages,
                    tools=tools,
                    model=model,
                    agent_name=agent.name,
                    system_prompt=system_prompt,
                    timeout_seconds=timeout_seconds,
                    conv_id=conv.id,
                    task_id=task_id,
                    event_queue=event_queue,
                    conversation_store=_conversation_store,
                    task_store=task_store,
                )
            )
            _running_background_tasks.add(bg_task)
            bg_task.add_done_callback(_running_background_tasks.discard)
            active_bg_task = bg_task

            # C1 guard: EventQueue is required for subscribe; fall back to
            # awaiting the background task directly if unavailable.
            if event_queue is None:
                logger.warning("EventQueue not configured; awaiting background task directly")
                try:
                    await bg_task
                except (RuntimeError, ConnectionError, asyncio.TimeoutError):
                    pass  # errors handled inside _execute_react_background
                active_bg_task = None
                continue

            # Subscribe to EventQueue (filtered by task_id) and forward
            # events to the WebSocket.  When the WebSocket disconnects,
            # this loop exits but the background task continues.
            # P1 #7 fix: bound the subscribe loop with a timeout so a
            # hung background task cannot block the WebSocket forever.
            # Matches the resume path's timeout strategy.
            _subscribe_timeout = _WS_HEARTBEAT_TIMEOUT * 10 if _WS_HEARTBEAT_TIMEOUT > 0 else 600
            try:
                async with asyncio.timeout(_subscribe_timeout):
                    async for event in event_queue.subscribe(task_id=task_id):
                        if event.event_type == TaskEventType.TASK_COMPLETED:
                            response_text = event.data.get("output", EMPTY_LLM_RESPONSE)
                            await websocket.send_json(
                                {
                                    "type": "result",
                                    "data": {
                                        "message": response_text,
                                        "timestamp": event.data.get(
                                            "timestamp",
                                            datetime.now(timezone.utc).isoformat(),
                                        ),
                                    },
                                }
                            )
                            await _record_experience(
                                routing_result.skill_name or "agent",
                                message_text,
                                "success" if response_text != EMPTY_LLM_RESPONSE else "failure",
                                (datetime.now(timezone.utc) - start_time).total_seconds(),
                            )
                            break
                        elif event.event_type == TaskEventType.TASK_FAILED:
                            await websocket.send_json(
                                {
                                    "type": "error",
                                    "data": {
                                        "message": event.data.get("error", "Unknown error"),
                                    },
                                }
                            )
                            await _record_experience(
                                routing_result.skill_name or "agent",
                                message_text,
                                "failure",
                                (datetime.now(timezone.utc) - start_time).total_seconds(),
                            )
                            break
                        else:
                            # Forward ReAct events as step messages.
                            # P1 #8/#10 fix: step and data are now top-level
                            # fields in event.data (no longer nested).
                            await websocket.send_json(
                                {
                                    "type": "step",
                                    "data": {
                                        "event_type": event.event_type,
                                        "step": event.data.get("step", 0),
                                        "data": {
                                            k: v
                                            for k, v in event.data.items()
                                            if k not in ("step", "timestamp")
                                        },
                                        "timestamp": event.data.get("timestamp", ""),
                                    },
                                }
                            )
            except TimeoutError:
                logger.warning(f"Subscribe loop timed out for task {task_id}")
                if active_bg_task is not None and not active_bg_task.done():
                    active_bg_task.cancel()
                await websocket.send_json(
                    {
                        "type": "error",
                        "data": {
                            "message": "Task timed out. Please retry your request.",
                            "task_id": task_id,
                        },
                    }
                )
            except RuntimeError as exc:
                # P1 #5: subscriber limit reached or EQ closed — send
                # a friendly error instead of terminating the connection.
                logger.warning("Subscribe failed for task %s: %s", task_id, exc)
                await websocket.send_json(
                    {
                        "type": "error",
                        "data": {
                            "message": "Server busy, please retry shortly.",
                            "task_id": task_id,
                        },
                    }
                )

    except WebSocketDisconnect:
        logger.debug(f"Portal WebSocket disconnected for conversation {conv.id if conv else 'N/A'}")
        # P0 fix: Do NOT cancel the background task on disconnect.
        # The entire purpose of the three-layer defense is to let the
        # background task continue running and persist the result so the
        # frontend can resume it after reconnection. Cancelling here would
        # kill the task, lose the full output, and mark it FAILED —
        # defeating layers 2 and 3. The task is only cancelled on explicit
        # user cancel (msg_type == 'cancel') or application shutdown.
    except asyncio.CancelledError:
        raise
    except Exception as e:
        logger.error(f"Portal WebSocket error: {e}")
        # P1 #6 fix: Do NOT cancel the background task on connection-level
        # errors (ConnectionResetError, BrokenPipeError, etc.). These are
        # functionally equivalent to WebSocketDisconnect — the client dropped
        # — and the background task must survive to persist its result.
        # Only cancel on truly unexpected errors that may have corrupted
        # state needed by the background task.
        if not isinstance(e, (ConnectionResetError, BrokenPipeError, ConnectionError)):
            if active_bg_task is not None and not active_bg_task.done():
                active_bg_task.cancel()
        # Emit task.failed to EQ if a task was in progress
        # (task_id is set when a user message is received; None before that)
        if task_id is not None and conv is not None:
            event_queue = getattr(websocket.app.state, "event_queue", None)
            await _emit_event_safe(
                event_queue,
                TaskEventType.TASK_FAILED,
                task_id=task_id,
                session_id=conv.id,
                data={"error": str(e)},
            )
        try:
            await websocket.send_json({"type": "error", "data": {"message": str(e)}})
        except (ConnectionError, RuntimeError, asyncio.TimeoutError):
            pass
    finally:
        # Remove from user-scoped push tracking on any disconnect/error/return.
        if ws_user_id:
            portal_connection_manager.remove(ws_user_id, websocket)