224 lines
8.0 KiB
Python
224 lines
8.0 KiB
Python
"""Add knowledge base tables with pgvector support

Revision ID: e5f7g9h1cd45
Revises: d4f6g8h0ab23
Create Date: 2026-05-23 12:00:00.000000

"""
|
|
from typing import Sequence, Union
|
|
|
|
import sqlalchemy as sa
|
|
from alembic import op
|
|
from sqlalchemy.dialects import postgresql
|
|
|
|
# Revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the ID of the
# migration it revises.
revision: str = "e5f7g9h1cd45"
down_revision: Union[str, None] = "d4f6g8h0ab23"

# No branch labels and no cross-revision dependencies are declared here.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
|
|
|
|
|
|
def upgrade() -> None:
    """Create the knowledge base schema.

    Steps, in execution order:

    1. Enable the ``vector`` extension if it is not already installed.
    2. Create ``knowledge_bases`` (references ``organizations`` and ``users``).
    3. Create ``knowledge_documents`` (child of ``knowledge_bases``).
    4. Create ``knowledge_chunks`` (child of ``knowledge_documents``), then add
       its ``embedding vector(1536)`` column and an HNSW cosine index with
       raw SQL.
    5. Create ``knowledge_search_logs``.

    Each table's secondary indexes are created right after the table itself.
    """

    # Column factories. Every call returns a fresh Column object, since a
    # Column instance cannot be attached to more than one table.
    def uuid_pk() -> sa.Column:
        return sa.Column(
            "id", postgresql.UUID(as_uuid=True), primary_key=True, nullable=False
        )

    def uuid_fk(name: str, target: str, *, ondelete: str, nullable: bool) -> sa.Column:
        return sa.Column(
            name,
            postgresql.UUID(as_uuid=True),
            sa.ForeignKey(target, ondelete=ondelete),
            nullable=nullable,
        )

    def counter(name: str) -> sa.Column:
        # Non-null integer that starts at zero on the server side.
        return sa.Column(name, sa.Integer, server_default="0", nullable=False)

    def jsonb(name: str) -> sa.Column:
        return sa.Column(name, postgresql.JSONB(astext_type=sa.Text()), nullable=True)

    def timestamp(name: str) -> sa.Column:
        # Timezone-aware timestamp defaulting to the server's now().
        return sa.Column(
            name,
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
            nullable=False,
        )

    def create_indexes(table_name: str, specs) -> None:
        # specs: iterable of (index_name, [column, ...]) pairs, created in order.
        for index_name, column_names in specs:
            op.create_index(index_name, table_name, column_names)

    # ------------------------------------------------------------------ #
    # 1. Enable pgvector extension
    # ------------------------------------------------------------------ #
    op.execute("CREATE EXTENSION IF NOT EXISTS vector")

    # ------------------------------------------------------------------ #
    # 2. knowledge_bases
    # ------------------------------------------------------------------ #
    op.create_table(
        "knowledge_bases",
        uuid_pk(),
        uuid_fk("organization_id", "organizations.id", ondelete="CASCADE", nullable=False),
        sa.Column("name", sa.String(200), nullable=False),
        sa.Column("type", sa.String(20), nullable=False),
        sa.Column("description", sa.Text, nullable=True),
        counter("document_count"),
        sa.Column("status", sa.String(20), server_default="active", nullable=False),
        uuid_fk("created_by", "users.id", ondelete="SET NULL", nullable=True),
        timestamp("created_at"),
        timestamp("updated_at"),
    )
    create_indexes(
        "knowledge_bases",
        (
            ("idx_knowledge_bases_organization_id", ["organization_id"]),
            ("idx_knowledge_bases_type", ["type"]),
            ("idx_knowledge_bases_status", ["status"]),
        ),
    )

    # ------------------------------------------------------------------ #
    # 3. knowledge_documents
    # ------------------------------------------------------------------ #
    op.create_table(
        "knowledge_documents",
        uuid_pk(),
        uuid_fk("knowledge_base_id", "knowledge_bases.id", ondelete="CASCADE", nullable=False),
        sa.Column("title", sa.String(500), nullable=False),
        sa.Column("source_type", sa.String(20), nullable=False),
        sa.Column("source_url", sa.String(2000), nullable=True),
        sa.Column("content", sa.Text, nullable=False),
        sa.Column("content_hash", sa.String(64), nullable=False),
        counter("chunk_count"),
        sa.Column("status", sa.String(20), server_default="processing", nullable=False),
        sa.Column("error_message", sa.Text, nullable=True),
        jsonb("metadata"),
        timestamp("created_at"),
        timestamp("updated_at"),
    )
    create_indexes(
        "knowledge_documents",
        (
            ("idx_knowledge_documents_knowledge_base_id", ["knowledge_base_id"]),
            ("idx_knowledge_documents_status", ["status"]),
            ("idx_knowledge_documents_content_hash", ["content_hash"]),
        ),
    )

    # ------------------------------------------------------------------ #
    # 4. knowledge_chunks (embedding column via raw SQL for vector type)
    # ------------------------------------------------------------------ #
    op.create_table(
        "knowledge_chunks",
        uuid_pk(),
        uuid_fk("document_id", "knowledge_documents.id", ondelete="CASCADE", nullable=False),
        sa.Column("content", sa.Text, nullable=False),
        sa.Column("chunk_index", sa.Integer, nullable=False),
        counter("token_count"),
        jsonb("metadata"),
        timestamp("created_at"),
    )

    # The pgvector type is not part of the SQLAlchemy dialect, so the column
    # is added with raw SQL. Dimension 1536 matches OpenAI
    # text-embedding-3-small.
    op.execute("ALTER TABLE knowledge_chunks ADD COLUMN embedding vector(1536)")

    create_indexes(
        "knowledge_chunks",
        (
            ("idx_knowledge_chunks_document_id", ["document_id"]),
            ("idx_knowledge_chunks_chunk_index", ["document_id", "chunk_index"]),
        ),
    )

    # HNSW index for approximate nearest-neighbor cosine similarity search.
    op.execute(
        "CREATE INDEX ix_knowledge_chunks_embedding "
        "ON knowledge_chunks USING hnsw (embedding vector_cosine_ops)"
    )

    # ------------------------------------------------------------------ #
    # 5. knowledge_search_logs
    # ------------------------------------------------------------------ #
    op.create_table(
        "knowledge_search_logs",
        uuid_pk(),
        uuid_fk("organization_id", "organizations.id", ondelete="CASCADE", nullable=False),
        uuid_fk("user_id", "users.id", ondelete="SET NULL", nullable=True),
        sa.Column("query", sa.Text, nullable=False),
        jsonb("knowledge_base_ids"),
        counter("results_count"),
        counter("latency_ms"),
        timestamp("created_at"),
    )
    create_indexes(
        "knowledge_search_logs",
        (
            ("idx_knowledge_search_logs_organization_id", ["organization_id"]),
            ("idx_knowledge_search_logs_user_id", ["user_id"]),
            ("idx_knowledge_search_logs_created_at", ["created_at"]),
        ),
    )
|
|
|
|
|
|
def downgrade() -> None:
    """Remove the knowledge base tables created by ``upgrade``.

    Tables are dropped in reverse dependency order so that no table is
    removed while another still holds a foreign key to it. The ``vector``
    extension is deliberately left installed, as other tables might rely
    on it.
    """
    op.drop_table("knowledge_search_logs")

    # Drop the raw-SQL HNSW index explicitly before its table goes away.
    op.execute("DROP INDEX IF EXISTS ix_knowledge_chunks_embedding")

    # Children first: chunks -> documents -> bases.
    for table_name in ("knowledge_chunks", "knowledge_documents", "knowledge_bases"):
        op.drop_table(table_name)
    # Note: we do NOT drop the vector extension as other tables might rely on it
|