# fischer-agentkit/tests/documents/test_db.py
#
# 255 lines
# 8.6 KiB
# Python

"""Tests for document DB persistence and DocumentService metadata operations.
Covers U1: DocumentService core architecture + database model.
Renderer-specific tests live in test_word_renderer.py etc.
"""
from __future__ import annotations
import asyncio
from pathlib import Path
import pytest
from agentkit.documents.db import (
delete_document,
get_conversation_documents,
get_document_by_id,
init_documents_db,
insert_document,
)
from agentkit.documents.models import DocumentMeta
from agentkit.documents.service import DocumentService, _sanitize_filename
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture
def tmp_db(tmp_path: Path) -> Path:
    """Create and initialise a brand-new documents database for one test.

    The database file lives under pytest's per-test ``tmp_path``; its path is
    returned so the test can pass it to the db helpers.
    """
    database_file = tmp_path.joinpath("test_documents.db")
    # The fixture itself is synchronous, so the async initialiser is driven
    # to completion on its own event loop here.
    asyncio.run(init_documents_db(database_file))
    return database_file
def _make_meta(
    doc_id: str = "test-id-1",
    filename: str = "report.docx",
    conversation_id: str = "conv-1",
    format: str = "word",
    created_at: str = "2026-06-23T00:00:00+00:00",
) -> DocumentMeta:
    """Build a ``DocumentMeta`` for tests; every argument has a default.

    ``stored_name`` is always derived as ``<doc_id>.docx`` and ``size`` is
    fixed at 1024, regardless of the other arguments.
    """
    # NOTE: ``format`` shadows the builtin, but it is part of this helper's
    # keyword interface, so the name is kept as-is.
    meta_fields = {
        "id": doc_id,
        "filename": filename,
        "stored_name": f"{doc_id}.docx",
        "format": format,
        "size": 1024,
        "conversation_id": conversation_id,
        "created_at": created_at,
    }
    return DocumentMeta(**meta_fields)
# ---------------------------------------------------------------------------
# init_documents_db
# ---------------------------------------------------------------------------
async def test_init_db_idempotent(tmp_path: Path) -> None:
    """Initialising the same DB path twice in a row must not raise."""
    target = tmp_path / "test.db"
    for _attempt in range(2):
        # The second pass runs against an already-initialised database.
        await init_documents_db(target)
    assert target.exists()
async def test_init_db_creates_parent_dir(tmp_path: Path) -> None:
    """A DB path whose parent directories do not exist yet still gets created."""
    nested_db = tmp_path.joinpath("nested", "deep", "test.db")
    await init_documents_db(nested_db)
    assert nested_db.exists()
# ---------------------------------------------------------------------------
# insert + query
# ---------------------------------------------------------------------------
async def test_insert_and_get_by_id(tmp_db: Path) -> None:
    """A stored document can be fetched back by id with its fields intact."""
    await insert_document(_make_meta(), tmp_db)

    fetched = await get_document_by_id("test-id-1", tmp_db)
    assert fetched is not None

    # Checked in the same order as the fields are listed here.
    expected_fields = {
        "id": "test-id-1",
        "filename": "report.docx",
        "format": "word",
        "size": 1024,
        "conversation_id": "conv-1",
    }
    for field_name, expected_value in expected_fields.items():
        assert getattr(fetched, field_name) == expected_value
async def test_get_by_id_not_found(tmp_db: Path) -> None:
    """Looking up an id that was never inserted yields ``None``."""
    assert await get_document_by_id("does-not-exist", tmp_db) is None
async def test_get_conversation_documents(tmp_db: Path) -> None:
    """Documents are listed per conversation, most recent ``created_at`` first."""
    seeded = (
        _make_meta(doc_id="doc-1", created_at="2026-06-23T10:00:00+00:00"),
        _make_meta(doc_id="doc-2", created_at="2026-06-23T11:00:00+00:00"),
        _make_meta(
            doc_id="doc-3",
            conversation_id="conv-2",
            created_at="2026-06-23T12:00:00+00:00",
        ),
    )
    for meta in seeded:
        await insert_document(meta, tmp_db)

    # conv-1 owns doc-1 and doc-2; the later timestamp must come first.
    first_conversation = await get_conversation_documents("conv-1", tmp_db)
    assert len(first_conversation) == 2
    assert [doc.id for doc in first_conversation] == ["doc-2", "doc-1"]

    # conv-2 owns only doc-3.
    second_conversation = await get_conversation_documents("conv-2", tmp_db)
    assert len(second_conversation) == 1
    assert second_conversation[0].id == "doc-3"
async def test_get_conversation_documents_empty(tmp_db: Path) -> None:
    """A conversation id with no stored documents produces an empty list."""
    listed = await get_conversation_documents("no-such-conv", tmp_db)
    assert listed == []
# ---------------------------------------------------------------------------
# delete
# ---------------------------------------------------------------------------
async def test_delete_document(tmp_db: Path) -> None:
    """First delete reports True and removes the row; a repeat delete reports False."""
    target_id = "test-id-1"
    await insert_document(_make_meta(doc_id=target_id), tmp_db)

    first_attempt = await delete_document(target_id, tmp_db)
    assert first_attempt is True

    # Nothing is left to delete the second time round.
    second_attempt = await delete_document(target_id, tmp_db)
    assert second_attempt is False

    # The row can no longer be looked up.
    assert await get_document_by_id(target_id, tmp_db) is None
# ---------------------------------------------------------------------------
# _sanitize_filename (path traversal protection)
# ---------------------------------------------------------------------------
def test_sanitize_filename_removes_path_separators() -> None:
    """Sanitized names never contain '/' or '\\', so they cannot traverse paths."""
    # Each case pairs a hostile input with a fragment that must survive
    # sanitization. The security property under test is only that no path
    # separator remains in the output.
    traversal_cases = (
        ("../../etc/passwd", "passwd"),
        ("..\\..\\windows\\system32", "system32"),
    )
    for hostile_name, surviving_fragment in traversal_cases:
        cleaned = _sanitize_filename(hostile_name)
        assert "/" not in cleaned
        assert "\\" not in cleaned
        assert surviving_fragment in cleaned

    # An already-safe filename passes through unchanged.
    harmless_name = "safe-name_v1.0.txt"
    assert _sanitize_filename(harmless_name) == harmless_name
def test_sanitize_filename_empty() -> None:
    """Empty input stays empty; input made only of separators loses them all."""
    assert _sanitize_filename("") == ""

    # Whatever "///" turns into, it must not contain a path separator.
    separators_only = _sanitize_filename("///")
    for separator in ("/", "\\"):
        assert separator not in separators_only
# ---------------------------------------------------------------------------
# DocumentService (metadata + download path, no rendering in U1)
# ---------------------------------------------------------------------------
async def test_service_get_download_path(tmp_path: Path) -> None:
    """get_download_path locates a ``<doc_id>.docx`` file in the upload dir."""
    db_file = tmp_path / "test.db"
    uploads = tmp_path / "uploads"
    await init_documents_db(db_file)
    service = DocumentService(upload_dir=uploads, db_path=db_file)

    # Place a stand-in document on disk under the name the service should find.
    doc_id = "abc123"
    stored_name = f"{doc_id}.docx"
    uploads.mkdir(parents=True, exist_ok=True)
    (uploads / stored_name).write_bytes(b"fake docx content")

    resolved = service.get_download_path(doc_id)
    assert resolved is not None
    assert resolved.name == stored_name
async def test_service_get_download_path_not_found(tmp_path: Path) -> None:
    """get_download_path yields ``None`` when no file was stored for the id."""
    db_file = tmp_path / "test.db"
    uploads = tmp_path / "uploads"
    await init_documents_db(db_file)

    service = DocumentService(upload_dir=uploads, db_path=db_file)
    assert service.get_download_path("nonexistent-id") is None
async def test_service_create_without_renderer_raises(tmp_path: Path) -> None:
    """create_document on a freshly built service raises ValueError for "word".

    The error message is expected to mention that no renderer is registered.
    """
    db_file = tmp_path / "test.db"
    uploads = tmp_path / "uploads"
    await init_documents_db(db_file)
    service = DocumentService(upload_dir=uploads, db_path=db_file)

    request = {"format": "word", "content": "# Test", "conversation_id": "conv-1"}
    with pytest.raises(ValueError, match="No renderer registered"):
        await service.create_document(**request)
async def test_service_create_unsupported_format_raises(tmp_path: Path) -> None:
    """create_document rejects the "pptx" format with an 'Unsupported format' ValueError."""
    db_file = tmp_path / "test.db"
    await init_documents_db(db_file)
    service = DocumentService(upload_dir=tmp_path / "uploads", db_path=db_file)

    request = {"format": "pptx", "content": "# Test", "conversation_id": "conv-1"}
    with pytest.raises(ValueError, match="Unsupported format"):
        await service.create_document(**request)
async def test_service_get_conversation_documents(tmp_path: Path) -> None:
    """The service lists a document that was inserted directly through the db module."""
    db_file = tmp_path / "test.db"
    await init_documents_db(db_file)
    # Seed the row via the db helper, bypassing the service.
    await insert_document(_make_meta(), db_file)

    service = DocumentService(upload_dir=tmp_path / "uploads", db_path=db_file)
    listed = await service.get_conversation_documents("conv-1")

    assert len(listed) == 1
    assert listed[0].id == "test-id-1"