# geo/backend/app/models/knowledge.py

import uuid
from datetime import datetime

from sqlalchemy import String, Integer, ForeignKey, Index, func, Text, DateTime
from sqlalchemy import Uuid
from sqlalchemy.orm import Mapped, mapped_column, relationship

from app.database import Base, JSONType

# Optional dependency: pgvector supplies the SQLAlchemy ``Vector`` column type.
try:
    from pgvector.sqlalchemy import Vector
except ImportError:
    # The pgvector package is not installed. ``Vector`` is left as ``None`` and
    # the ORM maps the embedding column with a Text fallback instead; per the
    # project notes, the real vector column is created by the migration via
    # raw SQL.
    Vector = None
    _VECTOR_AVAILABLE = False
else:
    _VECTOR_AVAILABLE = True


class KnowledgeBase(Base):
    """A named collection of knowledge documents owned by an organization.

    Rows live in ``knowledge_bases``. The owning organization is required and
    its deletion cascades to this table at the database level. The creating
    user is optional: the foreign key is nullable and is set to NULL when the
    user row is deleted, so ``creator`` may be ``None``.
    """

    __tablename__ = "knowledge_bases"

    id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    organization_id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True),
        ForeignKey("organizations.id", ondelete="CASCADE"),
        nullable=False,
    )
    name: Mapped[str] = mapped_column(String(200), nullable=False)
    type: Mapped[str] = mapped_column(String(20), nullable=False)  # "industry" / "enterprise"
    description: Mapped[str | None] = mapped_column(Text, nullable=True)
    # Defaults below are server-side only: the attribute is populated once the
    # row has been inserted and loaded/refreshed.
    document_count: Mapped[int] = mapped_column(Integer, server_default="0", nullable=False)
    status: Mapped[str] = mapped_column(String(20), server_default="active", nullable=False)
    created_by: Mapped[str | None] = mapped_column(
        String(36),
        ForeignKey("users.id", ondelete="SET NULL"),
        nullable=True,
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        onupdate=func.now(),
        nullable=False,
    )

    # Relationships
    organization: Mapped["Organization"] = relationship(
        "Organization", back_populates="knowledge_bases"
    )
    # Optional because created_by is nullable (and SET NULL on user deletion).
    creator: Mapped["User | None"] = relationship(
        "User", foreign_keys=[created_by]
    )
    # Documents are removed together with their knowledge base (ORM cascade).
    documents: Mapped[list["KnowledgeDocument"]] = relationship(
        "KnowledgeDocument", back_populates="knowledge_base", cascade="all, delete-orphan"
    )

    __table_args__ = (
        Index("idx_knowledge_bases_organization_id", "organization_id"),
        Index("idx_knowledge_bases_type", "type"),
        Index("idx_knowledge_bases_status", "status"),
    )


class KnowledgeDocument(Base):
    """A single source document stored inside a knowledge base.

    Rows live in ``knowledge_documents``. Each document belongs to exactly one
    knowledge base (deleted with it at the database level) and owns a list of
    ``KnowledgeChunk`` rows that are removed together with the document.
    """

    __tablename__ = "knowledge_documents"

    id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True), default=uuid.uuid4, primary_key=True
    )
    knowledge_base_id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True),
        ForeignKey("knowledge_bases.id", ondelete="CASCADE"),
        nullable=False,
    )
    title: Mapped[str] = mapped_column(String(500), nullable=False)
    # One of: "text" / "url" / "pdf" / "markdown"
    source_type: Mapped[str] = mapped_column(String(20), nullable=False)
    source_url: Mapped[str | None] = mapped_column(String(2000), nullable=True)
    content: Mapped[str] = mapped_column(Text, nullable=False)
    content_hash: Mapped[str] = mapped_column(String(64), nullable=False)
    chunk_count: Mapped[int] = mapped_column(
        Integer, nullable=False, server_default="0"
    )
    # One of: "processing" / "ready" / "failed"
    status: Mapped[str] = mapped_column(
        String(20), nullable=False, server_default="processing"
    )
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)
    # The database column is called "metadata"; the Python attribute uses a
    # different name to avoid the SQLAlchemy reserved keyword conflict.
    extra_metadata: Mapped[dict | None] = mapped_column(
        "metadata", JSONType, nullable=True
    )
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), nullable=False, server_default=func.now()
    )
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        nullable=False,
        server_default=func.now(),
        onupdate=func.now(),
    )

    # Relationships
    knowledge_base: Mapped["KnowledgeBase"] = relationship(
        "KnowledgeBase",
        back_populates="documents",
    )
    chunks: Mapped[list["KnowledgeChunk"]] = relationship(
        "KnowledgeChunk",
        back_populates="document",
        cascade="all, delete-orphan",
    )

    __table_args__ = (
        Index("idx_knowledge_documents_knowledge_base_id", "knowledge_base_id"),
        Index("idx_knowledge_documents_status", "status"),
        Index("idx_knowledge_documents_content_hash", "content_hash"),
    )


# Embedding column factory: pgvector-backed when available, Text otherwise.
def _build_chunk_columns():
    """Build the nullable embedding column for the chunk model.

    Returns:
        A ``mapped_column`` typed ``Vector(1536)`` when pgvector could be
        imported, otherwise a ``Text`` column.
    """
    # The Text fallback stores serialized embeddings and is not meant for
    # production use; the actual vector column is created by the migration
    # via raw SQL.
    embedding_type = Vector(1536) if _VECTOR_AVAILABLE else Text
    return mapped_column(embedding_type, nullable=True)


class KnowledgeChunk(Base):
    """One chunk of a knowledge document's content, with an optional embedding.

    Rows live in ``knowledge_chunks``. Each chunk belongs to one
    ``KnowledgeDocument`` (deleted with it at the database level) and records
    its position within that document in ``chunk_index``.
    """

    __tablename__ = "knowledge_chunks"

    id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True),
        ForeignKey("knowledge_documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    content: Mapped[str] = mapped_column(Text, nullable=False)
    # Vector(1536) for OpenAI text-embedding-3-small; requires pgvector extension.
    # The column comes from _build_chunk_columns() so the pgvector/Text choice
    # lives in one place (Text is used when the pgvector package is missing).
    # NOTE(review): annotated as a nullable float list instead of the previous
    # Mapped[None]; the value actually loaded may be another sequence type with
    # pgvector, or a str under the Text fallback - confirm against callers.
    embedding: Mapped[list[float] | None] = _build_chunk_columns()
    chunk_index: Mapped[int] = mapped_column(Integer, nullable=False)
    token_count: Mapped[int] = mapped_column(Integer, server_default="0", nullable=False)
    # mapped_column("metadata") to avoid SQLAlchemy reserved keyword conflict
    extra_metadata: Mapped[dict | None] = mapped_column("metadata", JSONType, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )

    # Relationships
    document: Mapped["KnowledgeDocument"] = relationship(
        "KnowledgeDocument", back_populates="chunks"
    )

    __table_args__ = (
        Index("idx_knowledge_chunks_document_id", "document_id"),
        # Composite index on (document_id, chunk_index).
        Index("idx_knowledge_chunks_chunk_index", "document_id", "chunk_index"),
        # HNSW index on embedding is created via raw SQL in migration
    )


class KnowledgeSearchLog(Base):
    """Log entry for one knowledge search issued within an organization.

    Rows live in ``knowledge_search_logs``. The organization is required and
    its deletion cascades to this table at the database level. The user is
    optional: the foreign key is nullable and is set to NULL when the user row
    is deleted, so ``user`` may be ``None``.
    """

    __tablename__ = "knowledge_search_logs"

    id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    organization_id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True),
        ForeignKey("organizations.id", ondelete="CASCADE"),
        nullable=False,
    )
    user_id: Mapped[str | None] = mapped_column(
        String(36),
        ForeignKey("users.id", ondelete="SET NULL"),
        nullable=True,
    )
    query: Mapped[str] = mapped_column(Text, nullable=False)
    # Stored as JSON rather than as foreign keys.
    # NOTE(review): presumably the ids of the knowledge bases searched - confirm.
    knowledge_base_ids: Mapped[list | None] = mapped_column(JSONType, nullable=True)
    results_count: Mapped[int] = mapped_column(Integer, server_default="0", nullable=False)
    latency_ms: Mapped[int] = mapped_column(Integer, server_default="0", nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        server_default=func.now(),
        nullable=False,
    )

    # Relationships
    organization: Mapped["Organization"] = relationship(
        "Organization", foreign_keys=[organization_id]
    )
    # Optional because user_id is nullable (and SET NULL on user deletion).
    user: Mapped["User | None"] = relationship(
        "User", foreign_keys=[user_id]
    )

    __table_args__ = (
        Index("idx_knowledge_search_logs_organization_id", "organization_id"),
        Index("idx_knowledge_search_logs_user_id", "user_id"),
        Index("idx_knowledge_search_logs_created_at", "created_at"),
    )