geo/backend/app/models/knowledge.py

212 lines
7.7 KiB
Python

import uuid
from datetime import datetime
from sqlalchemy import String, Integer, ForeignKey, Index, func, Text
from sqlalchemy import Uuid
from sqlalchemy.orm import Mapped, mapped_column, relationship
from app.database import Base, JSONType
# pgvector Vector type - imported conditionally
try:
from pgvector.sqlalchemy import Vector
_VECTOR_AVAILABLE = True
except ImportError:
# pgvector package not installed; the ORM falls back to a Text column for
# embeddings (see KnowledgeChunk.embedding) instead of a Vector column.
# The migration file handles the actual column creation via raw SQL
Vector = None
_VECTOR_AVAILABLE = False
class KnowledgeBase(Base):
    """A named knowledge base belonging to an organization.

    Rows are deleted together with their organization
    (``ondelete="CASCADE"``). Deleting the creating user only nulls
    ``created_by`` (``ondelete="SET NULL"``), so ``creator`` may be ``None``.
    Documents are attached through ``documents`` and are removed with the
    knowledge base by the ORM cascade (``all, delete-orphan``).
    """

    __tablename__ = "knowledge_bases"

    id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    organization_id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True),
        ForeignKey("organizations.id", ondelete="CASCADE"),
        nullable=False,
    )
    name: Mapped[str] = mapped_column(String(200), nullable=False)
    type: Mapped[str] = mapped_column(String(20), nullable=False)  # "industry" / "enterprise"
    description: Mapped[str | None] = mapped_column(Text, nullable=True)
    document_count: Mapped[int] = mapped_column(Integer, server_default="0", nullable=False)
    status: Mapped[str] = mapped_column(String(20), server_default="active", nullable=False)
    # Nullable: the database sets this to NULL when the referenced user is deleted.
    created_by: Mapped[str | None] = mapped_column(
        String(36),
        ForeignKey("users.id", ondelete="SET NULL"),
        nullable=True,
    )
    created_at: Mapped[datetime] = mapped_column(
        server_default=func.now(),
        nullable=False,
    )
    # Refreshed with func.now() on every ORM-issued UPDATE.
    updated_at: Mapped[datetime] = mapped_column(
        server_default=func.now(),
        onupdate=func.now(),
        nullable=False,
    )

    # Relationships
    organization: Mapped["Organization"] = relationship(
        "Organization", back_populates="knowledge_bases"
    )
    # created_by is nullable, so the related user can be absent; the
    # annotation is optional to keep static typing honest about that.
    creator: Mapped["User | None"] = relationship(
        "User", foreign_keys=[created_by]
    )
    documents: Mapped[list["KnowledgeDocument"]] = relationship(
        "KnowledgeDocument", back_populates="knowledge_base", cascade="all, delete-orphan"
    )

    __table_args__ = (
        Index("idx_knowledge_bases_organization_id", "organization_id"),
        Index("idx_knowledge_bases_type", "type"),
        Index("idx_knowledge_bases_status", "status"),
    )
class KnowledgeDocument(Base):
    """A single source document stored inside a knowledge base.

    The row keeps the full ``content`` plus a ``content_hash`` and a
    processing ``status``. Its chunks live in ``KnowledgeChunk`` and are
    deleted with the document, both by the ORM cascade and by the foreign
    key's ``ondelete="CASCADE"``.
    """

    __tablename__ = "knowledge_documents"
    __table_args__ = (
        Index("idx_knowledge_documents_knowledge_base_id", "knowledge_base_id"),
        Index("idx_knowledge_documents_status", "status"),
        Index("idx_knowledge_documents_content_hash", "content_hash"),
    )

    # --- Identity -----------------------------------------------------------
    id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True), default=uuid.uuid4, primary_key=True
    )
    knowledge_base_id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True),
        ForeignKey("knowledge_bases.id", ondelete="CASCADE"),
        nullable=False,
    )

    # --- Source and content -------------------------------------------------
    title: Mapped[str] = mapped_column(String(500), nullable=False)
    # One of "text" / "url" / "pdf" / "markdown"
    source_type: Mapped[str] = mapped_column(String(20), nullable=False)
    source_url: Mapped[str | None] = mapped_column(String(2000), nullable=True)
    content: Mapped[str] = mapped_column(Text, nullable=False)
    # 64 characters wide; presumably a hex digest of `content` (confirm with the writer).
    content_hash: Mapped[str] = mapped_column(String(64), nullable=False)

    # --- Processing state ---------------------------------------------------
    chunk_count: Mapped[int] = mapped_column(
        Integer, nullable=False, server_default="0"
    )
    # One of "processing" / "ready" / "failed"
    status: Mapped[str] = mapped_column(
        String(20), nullable=False, server_default="processing"
    )
    error_message: Mapped[str | None] = mapped_column(Text, nullable=True)

    # The attribute is named extra_metadata because `metadata` is reserved on
    # declarative classes; the underlying database column is still "metadata".
    extra_metadata: Mapped[dict | None] = mapped_column(
        "metadata", JSONType, nullable=True
    )

    # --- Timestamps ---------------------------------------------------------
    created_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=func.now()
    )
    updated_at: Mapped[datetime] = mapped_column(
        nullable=False, server_default=func.now(), onupdate=func.now()
    )

    # --- Relationships ------------------------------------------------------
    knowledge_base: Mapped["KnowledgeBase"] = relationship(
        "KnowledgeBase",
        back_populates="documents",
    )
    chunks: Mapped[list["KnowledgeChunk"]] = relationship(
        "KnowledgeChunk",
        back_populates="document",
        cascade="all, delete-orphan",
    )
# Embedding column factory: Vector when pgvector is importable, Text otherwise
def _build_chunk_columns():
    """Build a nullable embedding column matching pgvector availability.

    Returns a ``mapped_column`` typed ``Vector(1536)`` when the pgvector
    package was imported successfully, otherwise one typed ``Text``. The Text
    variant is a fallback for serialized embeddings and is not meant for
    production use; the real vector column is created by the migration via
    raw SQL.

    NOTE(review): no caller is visible in this module; KnowledgeChunk builds
    its embedding column inline with the same condition.
    """
    embedding_type = Vector(1536) if _VECTOR_AVAILABLE else Text
    return mapped_column(embedding_type, nullable=True)
class KnowledgeChunk(Base):
    """One chunk of a knowledge document, with an optional embedding.

    Chunks are deleted with their document (``ondelete="CASCADE"``).
    ``chunk_index`` is indexed together with ``document_id``.
    """

    __tablename__ = "knowledge_chunks"

    id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    document_id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True),
        ForeignKey("knowledge_documents.id", ondelete="CASCADE"),
        nullable=False,
    )
    content: Mapped[str] = mapped_column(Text, nullable=False)
    # Vector(1536) for OpenAI text-embedding-3-small; requires pgvector extension.
    # If the pgvector package is unavailable, the ORM maps this column as Text
    # instead (fallback only, not the real vector column).
    # Annotated as ``object | None`` rather than ``None``: the Python value
    # type depends on which column type is active, and the attribute must
    # accept non-None embeddings; callers narrow the type before use.
    embedding: Mapped[object | None] = mapped_column(
        Vector(1536) if _VECTOR_AVAILABLE else Text,
        nullable=True,
    )
    chunk_index: Mapped[int] = mapped_column(Integer, nullable=False)
    token_count: Mapped[int] = mapped_column(Integer, server_default="0", nullable=False)
    # The attribute is named extra_metadata because `metadata` is reserved on
    # declarative classes; the underlying database column is still "metadata".
    extra_metadata: Mapped[dict | None] = mapped_column("metadata", JSONType, nullable=True)
    created_at: Mapped[datetime] = mapped_column(
        server_default=func.now(),
        nullable=False,
    )

    # Relationships
    document: Mapped["KnowledgeDocument"] = relationship(
        "KnowledgeDocument", back_populates="chunks"
    )

    __table_args__ = (
        Index("idx_knowledge_chunks_document_id", "document_id"),
        Index("idx_knowledge_chunks_chunk_index", "document_id", "chunk_index"),
        # HNSW index on embedding is created via raw SQL in migration
    )
class KnowledgeSearchLog(Base):
    """Record of one knowledge search: query text, result count and latency.

    Rows are deleted with their organization (``ondelete="CASCADE"``).
    Deleting the searching user only nulls ``user_id``
    (``ondelete="SET NULL"``), so ``user`` may be ``None``.
    """

    __tablename__ = "knowledge_search_logs"

    id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True),
        primary_key=True,
        default=uuid.uuid4,
    )
    organization_id: Mapped[uuid.UUID] = mapped_column(
        Uuid(as_uuid=True),
        ForeignKey("organizations.id", ondelete="CASCADE"),
        nullable=False,
    )
    # Nullable: the database sets this to NULL when the referenced user is deleted.
    user_id: Mapped[str | None] = mapped_column(
        String(36),
        ForeignKey("users.id", ondelete="SET NULL"),
        nullable=True,
    )
    query: Mapped[str] = mapped_column(Text, nullable=False)
    # Stored as JSON; presumably the ids of the knowledge bases searched (confirm with the writer).
    knowledge_base_ids: Mapped[list | None] = mapped_column(JSONType, nullable=True)
    results_count: Mapped[int] = mapped_column(Integer, server_default="0", nullable=False)
    latency_ms: Mapped[int] = mapped_column(Integer, server_default="0", nullable=False)
    created_at: Mapped[datetime] = mapped_column(
        server_default=func.now(),
        nullable=False,
    )

    # Relationships
    organization: Mapped["Organization"] = relationship(
        "Organization", foreign_keys=[organization_id]
    )
    # user_id is nullable, so the related user can be absent; the annotation
    # is optional to keep static typing honest about that.
    user: Mapped["User | None"] = relationship(
        "User", foreign_keys=[user_id]
    )

    __table_args__ = (
        Index("idx_knowledge_search_logs_organization_id", "organization_id"),
        Index("idx_knowledge_search_logs_user_id", "user_id"),
        Index("idx_knowledge_search_logs_created_at", "created_at"),
    )