"""Add knowledge base tables with pgvector support Revision ID: e5f7g9h1cd45 Revises: d4f6g8h0ab23 Create Date: 2026-05-23 12:00:00.000000 """ from typing import Sequence, Union import sqlalchemy as sa from alembic import op from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. revision: str = "e5f7g9h1cd45" down_revision: Union[str, None] = "d4f6g8h0ab23" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None def upgrade() -> None: # ------------------------------------------------------------------ # # 1. Enable pgvector extension # ------------------------------------------------------------------ # op.execute("CREATE EXTENSION IF NOT EXISTS vector") # ------------------------------------------------------------------ # # 2. knowledge_bases # ------------------------------------------------------------------ # op.create_table( "knowledge_bases", sa.Column( "id", postgresql.UUID(as_uuid=True), primary_key=True, nullable=False, ), sa.Column( "organization_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("organizations.id", ondelete="CASCADE"), nullable=False, ), sa.Column("name", sa.String(200), nullable=False), sa.Column("type", sa.String(20), nullable=False), sa.Column("description", sa.Text, nullable=True), sa.Column("document_count", sa.Integer, server_default="0", nullable=False), sa.Column("status", sa.String(20), server_default="active", nullable=False), sa.Column( "created_by", postgresql.UUID(as_uuid=True), sa.ForeignKey("users.id", ondelete="SET NULL"), nullable=True, ), sa.Column( "created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False, ), sa.Column( "updated_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False, ), ) op.create_index("idx_knowledge_bases_organization_id", "knowledge_bases", ["organization_id"]) op.create_index("idx_knowledge_bases_type", "knowledge_bases", ["type"]) op.create_index("idx_knowledge_bases_status", "knowledge_bases", ["status"]) # ------------------------------------------------------------------ # # 3. knowledge_documents # ------------------------------------------------------------------ # op.create_table( "knowledge_documents", sa.Column( "id", postgresql.UUID(as_uuid=True), primary_key=True, nullable=False, ), sa.Column( "knowledge_base_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("knowledge_bases.id", ondelete="CASCADE"), nullable=False, ), sa.Column("title", sa.String(500), nullable=False), sa.Column("source_type", sa.String(20), nullable=False), sa.Column("source_url", sa.String(2000), nullable=True), sa.Column("content", sa.Text, nullable=False), sa.Column("content_hash", sa.String(64), nullable=False), sa.Column("chunk_count", sa.Integer, server_default="0", nullable=False), sa.Column("status", sa.String(20), server_default="processing", nullable=False), sa.Column("error_message", sa.Text, nullable=True), sa.Column("metadata", postgresql.JSONB(astext_type=sa.Text()), nullable=True), sa.Column( "created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False, ), sa.Column( "updated_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False, ), ) op.create_index( "idx_knowledge_documents_knowledge_base_id", "knowledge_documents", ["knowledge_base_id"], ) op.create_index("idx_knowledge_documents_status", "knowledge_documents", ["status"]) op.create_index( "idx_knowledge_documents_content_hash", "knowledge_documents", ["content_hash"] ) # ------------------------------------------------------------------ # # 4. knowledge_chunks (embedding column via raw SQL for vector type) # ------------------------------------------------------------------ # op.create_table( "knowledge_chunks", sa.Column( "id", postgresql.UUID(as_uuid=True), primary_key=True, nullable=False, ), sa.Column( "document_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("knowledge_documents.id", ondelete="CASCADE"), nullable=False, ), sa.Column("content", sa.Text, nullable=False), sa.Column("chunk_index", sa.Integer, nullable=False), sa.Column("token_count", sa.Integer, server_default="0", nullable=False), sa.Column("metadata", postgresql.JSONB(astext_type=sa.Text()), nullable=True), sa.Column( "created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False, ), ) # Add vector embedding column via raw SQL (pgvector type not in SA dialect) # Dimension 1536 matches OpenAI text-embedding-3-small op.execute( "ALTER TABLE knowledge_chunks ADD COLUMN embedding vector(1536)" ) op.create_index("idx_knowledge_chunks_document_id", "knowledge_chunks", ["document_id"]) op.create_index( "idx_knowledge_chunks_chunk_index", "knowledge_chunks", ["document_id", "chunk_index"], ) # HNSW index for approximate nearest-neighbor cosine similarity search op.execute( "CREATE INDEX ix_knowledge_chunks_embedding " "ON knowledge_chunks USING hnsw (embedding vector_cosine_ops)" ) # ------------------------------------------------------------------ # # 5. knowledge_search_logs # ------------------------------------------------------------------ # op.create_table( "knowledge_search_logs", sa.Column( "id", postgresql.UUID(as_uuid=True), primary_key=True, nullable=False, ), sa.Column( "organization_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("organizations.id", ondelete="CASCADE"), nullable=False, ), sa.Column( "user_id", postgresql.UUID(as_uuid=True), sa.ForeignKey("users.id", ondelete="SET NULL"), nullable=True, ), sa.Column("query", sa.Text, nullable=False), sa.Column("knowledge_base_ids", postgresql.JSONB(astext_type=sa.Text()), nullable=True), sa.Column("results_count", sa.Integer, server_default="0", nullable=False), sa.Column("latency_ms", sa.Integer, server_default="0", nullable=False), sa.Column( "created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False, ), ) op.create_index( "idx_knowledge_search_logs_organization_id", "knowledge_search_logs", ["organization_id"], ) op.create_index( "idx_knowledge_search_logs_user_id", "knowledge_search_logs", ["user_id"] ) op.create_index( "idx_knowledge_search_logs_created_at", "knowledge_search_logs", ["created_at"] ) def downgrade() -> None: # Drop tables in reverse dependency order op.drop_table("knowledge_search_logs") op.execute("DROP INDEX IF EXISTS ix_knowledge_chunks_embedding") op.drop_table("knowledge_chunks") op.drop_table("knowledge_documents") op.drop_table("knowledge_bases") # Note: we do NOT drop the vector extension as other tables might rely on it