geo/backend/alembic/versions/e5f7g9h1cd45_add_knowledge_...

224 lines
8.0 KiB
Python

"""Add knowledge base tables with pgvector support
Revision ID: e5f7g9h1cd45
Revises: d4f6g8h0ab23
Create Date: 2026-05-23 12:00:00.000000
"""
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
# `revision` is this migration's own ID and `down_revision` is the ID of the
# migration it is applied on top of; both match the "Revision ID" / "Revises"
# values in the module docstring.
revision: str = "e5f7g9h1cd45"
down_revision: Union[str, None] = "d4f6g8h0ab23"
# This migration declares no branch labels and no extra dependencies.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Create the knowledge-base schema.

    Enables the ``vector`` extension and then creates, in this order,
    ``knowledge_bases``, ``knowledge_documents``, ``knowledge_chunks`` and
    ``knowledge_search_logs`` together with their indexes. Each table that is
    referenced by a foreign key inside this migration is created before the
    table that references it.

    NOTE(review): ``organizations`` and ``users`` are referenced by foreign
    keys but are not created here; presumably an earlier revision provides
    them -- confirm.
    """
    # The extension has to exist before the vector column and the HNSW index
    # on knowledge_chunks can be created.
    op.execute("CREATE EXTENSION IF NOT EXISTS vector")

    _create_knowledge_bases()
    _create_knowledge_documents()
    _create_knowledge_chunks()
    _create_knowledge_search_logs()


def _uuid_pk() -> "sa.Column":
    """Return a new non-null UUID primary-key column named ``id``."""
    return sa.Column(
        "id",
        postgresql.UUID(as_uuid=True),
        primary_key=True,
        nullable=False,
    )


def _uuid_fk(name: str, target: str, *, ondelete: str, nullable: bool) -> "sa.Column":
    """Return a new UUID column ``name`` with a foreign key to ``target``."""
    return sa.Column(
        name,
        postgresql.UUID(as_uuid=True),
        sa.ForeignKey(target, ondelete=ondelete),
        nullable=nullable,
    )


def _timestamp(name: str) -> "sa.Column":
    """Return a new non-null timestamptz column defaulting to ``now()``."""
    return sa.Column(
        name,
        sa.DateTime(timezone=True),
        server_default=sa.text("now()"),
        nullable=False,
    )


def _jsonb(name: str) -> "sa.Column":
    """Return a new nullable JSONB column."""
    return sa.Column(name, postgresql.JSONB(astext_type=sa.Text()), nullable=True)


def _counter(name: str) -> "sa.Column":
    """Return a new non-null integer column with a server default of 0."""
    return sa.Column(name, sa.Integer, server_default="0", nullable=False)


def _create_indexes(table_name: str, index_specs) -> None:
    """Create one index per ``(index_name, columns)`` pair, in the given order."""
    for index_name, columns in index_specs:
        op.create_index(index_name, table_name, columns)


def _create_knowledge_bases() -> None:
    """Create the ``knowledge_bases`` table and its three lookup indexes."""
    op.create_table(
        "knowledge_bases",
        _uuid_pk(),
        _uuid_fk(
            "organization_id",
            "organizations.id",
            ondelete="CASCADE",
            nullable=False,
        ),
        sa.Column("name", sa.String(200), nullable=False),
        sa.Column("type", sa.String(20), nullable=False),
        sa.Column("description", sa.Text, nullable=True),
        _counter("document_count"),
        sa.Column("status", sa.String(20), server_default="active", nullable=False),
        # The creating user is optional; the FK is declared with SET NULL.
        _uuid_fk("created_by", "users.id", ondelete="SET NULL", nullable=True),
        _timestamp("created_at"),
        _timestamp("updated_at"),
    )
    _create_indexes(
        "knowledge_bases",
        [
            ("idx_knowledge_bases_organization_id", ["organization_id"]),
            ("idx_knowledge_bases_type", ["type"]),
            ("idx_knowledge_bases_status", ["status"]),
        ],
    )


def _create_knowledge_documents() -> None:
    """Create the ``knowledge_documents`` table and its three indexes."""
    op.create_table(
        "knowledge_documents",
        _uuid_pk(),
        _uuid_fk(
            "knowledge_base_id",
            "knowledge_bases.id",
            ondelete="CASCADE",
            nullable=False,
        ),
        sa.Column("title", sa.String(500), nullable=False),
        sa.Column("source_type", sa.String(20), nullable=False),
        sa.Column("source_url", sa.String(2000), nullable=True),
        sa.Column("content", sa.Text, nullable=False),
        sa.Column("content_hash", sa.String(64), nullable=False),
        _counter("chunk_count"),
        sa.Column("status", sa.String(20), server_default="processing", nullable=False),
        sa.Column("error_message", sa.Text, nullable=True),
        _jsonb("metadata"),
        _timestamp("created_at"),
        _timestamp("updated_at"),
    )
    _create_indexes(
        "knowledge_documents",
        [
            ("idx_knowledge_documents_knowledge_base_id", ["knowledge_base_id"]),
            ("idx_knowledge_documents_status", ["status"]),
            ("idx_knowledge_documents_content_hash", ["content_hash"]),
        ],
    )


def _create_knowledge_chunks() -> None:
    """Create ``knowledge_chunks``, its vector column and all of its indexes."""
    op.create_table(
        "knowledge_chunks",
        _uuid_pk(),
        _uuid_fk(
            "document_id",
            "knowledge_documents.id",
            ondelete="CASCADE",
            nullable=False,
        ),
        sa.Column("content", sa.Text, nullable=False),
        sa.Column("chunk_index", sa.Integer, nullable=False),
        _counter("token_count"),
        _jsonb("metadata"),
        _timestamp("created_at"),
    )
    # The embedding column is added with raw SQL because the pgvector type is
    # not part of the SQLAlchemy dialect used here.
    # Dimension 1536 matches OpenAI text-embedding-3-small.
    op.execute(
        "ALTER TABLE knowledge_chunks ADD COLUMN embedding vector(1536)"
    )
    _create_indexes(
        "knowledge_chunks",
        [
            ("idx_knowledge_chunks_document_id", ["document_id"]),
            ("idx_knowledge_chunks_chunk_index", ["document_id", "chunk_index"]),
        ],
    )
    # HNSW index for approximate nearest-neighbor cosine similarity search.
    op.execute(
        "CREATE INDEX ix_knowledge_chunks_embedding "
        "ON knowledge_chunks USING hnsw (embedding vector_cosine_ops)"
    )


def _create_knowledge_search_logs() -> None:
    """Create the ``knowledge_search_logs`` table and its three indexes."""
    op.create_table(
        "knowledge_search_logs",
        _uuid_pk(),
        _uuid_fk(
            "organization_id",
            "organizations.id",
            ondelete="CASCADE",
            nullable=False,
        ),
        _uuid_fk("user_id", "users.id", ondelete="SET NULL", nullable=True),
        sa.Column("query", sa.Text, nullable=False),
        _jsonb("knowledge_base_ids"),
        _counter("results_count"),
        _counter("latency_ms"),
        _timestamp("created_at"),
    )
    _create_indexes(
        "knowledge_search_logs",
        [
            ("idx_knowledge_search_logs_organization_id", ["organization_id"]),
            ("idx_knowledge_search_logs_user_id", ["user_id"]),
            ("idx_knowledge_search_logs_created_at", ["created_at"]),
        ],
    )


def downgrade() -> None:
    """Remove the four knowledge-base tables created by ``upgrade``.

    Tables are dropped in reverse dependency order so that a table is always
    removed before the table its foreign key points at. The ``vector``
    extension is left installed.
    """
    op.drop_table("knowledge_search_logs")

    # The embedding index was created with raw SQL, so it is dropped the same
    # way before its table goes; IF EXISTS keeps this safe if it is absent.
    op.execute("DROP INDEX IF EXISTS ix_knowledge_chunks_embedding")

    # Children first: chunks -> documents -> bases.
    for table_name in (
        "knowledge_chunks",
        "knowledge_documents",
        "knowledge_bases",
    ):
        op.drop_table(table_name)

    # Note: we do NOT drop the vector extension as other tables might rely on it