fischer-agentkit/tests/tools/test_document_tool.py

404 lines
14 KiB
Python

"""Tests for DocumentTool — Agent tool wrapper (U6 create + U9 read)."""
from __future__ import annotations
from pathlib import Path
import pytest
from agentkit.documents.db import init_documents_db
from agentkit.documents.renderers.excel_renderer import ExcelRenderer
from agentkit.documents.renderers.pdf_renderer import PDFRenderer
from agentkit.documents.renderers.word_renderer import WordRenderer
from agentkit.documents.service import DocumentService
from agentkit.memory.document_loader import DocumentLoader
from agentkit.tools.document_tool import DocumentTool
@pytest.fixture
def service(tmp_path: Path) -> DocumentService:
    """Build a DocumentService backed by a fresh database under ``tmp_path``.

    The documents database is initialised at ``tmp_path / "test.db"``, uploads
    go to ``tmp_path / "uploads"``, and the Word, Excel and PDF renderers are
    registered under the format names "word", "excel" and "pdf".
    """
    import asyncio

    database_file = tmp_path / "test.db"
    uploads = tmp_path / "uploads"

    # This fixture is synchronous, so the awaitable returned by
    # init_documents_db is driven to completion with asyncio.run.
    asyncio.run(init_documents_db(database_file))

    document_service = DocumentService(upload_dir=uploads, db_path=database_file)

    # Each renderer is instantiated right before it is registered, in the
    # order word -> excel -> pdf.
    renderer_classes = (
        ("word", WordRenderer),
        ("excel", ExcelRenderer),
        ("pdf", PDFRenderer),
    )
    for format_name, renderer_cls in renderer_classes:
        document_service.register_renderer(format_name, renderer_cls())

    # ponytail: no TemplateRenderer is registered here. Per the original
    # author's note, DocumentService keeps one renderer per format and its
    # template path calls renderer.render_template, which WordRenderer does
    # not provide (NOTE(review): not verifiable from this file -- confirm
    # against DocumentService). These tests therefore only cover create
    # without a template.
    return document_service
@pytest.fixture
def tool(service: DocumentService) -> DocumentTool:
    """Provide a DocumentTool wired to the ``service`` fixture."""
    document_tool = DocumentTool(service=service)
    return document_tool
# ---------------------------------------------------------------------------
# create action — word
# ---------------------------------------------------------------------------
async def test_create_word(tool: DocumentTool) -> None:
    """Creating with format=word succeeds and reports .docx document metadata."""
    request = {
        "format": "word",
        "content": "# Test Report\n\nThis is a test paragraph.\n",
        "conversation_id": "conv-1",
    }
    outcome = await tool.execute(**request)

    assert outcome["success"] is True

    document = outcome["document"]
    assert document["format"] == "word"
    assert document["filename"].endswith(".docx")
    assert document["size"] > 0
    assert document["conversation_id"] == "conv-1"
    # A non-empty id must have been assigned to the new document.
    assert document["id"]
async def test_create_excel(tool: DocumentTool) -> None:
    """Creating with format=excel from a JSON string yields a .xlsx document."""
    sheet_json = '{"Data": [["A", "B"], ["1", "2"]]}'

    outcome = await tool.execute(
        format="excel",
        content=sheet_json,
        conversation_id="conv-1",
    )

    assert outcome["success"] is True
    document = outcome["document"]
    assert document["format"] == "excel"
    assert document["filename"].endswith(".xlsx")
async def test_create_pdf(tool: DocumentTool) -> None:
    """Creating with format=pdf from Markdown text yields a .pdf document."""
    markdown = "# PDF Title\n\nParagraph text.\n"

    outcome = await tool.execute(
        format="pdf",
        content=markdown,
        conversation_id="conv-1",
    )

    assert outcome["success"] is True
    document = outcome["document"]
    assert document["format"] == "pdf"
    assert document["filename"].endswith(".pdf")
async def test_create_with_filename(tool: DocumentTool) -> None:
    """A caller-supplied filename is echoed back in the document metadata."""
    requested_name = "my-report.docx"

    outcome = await tool.execute(
        format="word",
        content="# Test",
        conversation_id="conv-1",
        filename=requested_name,
    )

    assert outcome["success"] is True
    assert outcome["document"]["filename"] == "my-report.docx"
# ---------------------------------------------------------------------------
# error paths
# ---------------------------------------------------------------------------
async def test_missing_format(tool: DocumentTool) -> None:
    """Omitting format fails and the error message mentions "format"."""
    request = {"content": "# Test", "conversation_id": "conv-1"}

    outcome = await tool.execute(**request)

    assert outcome["success"] is False
    error_message = outcome["error"]
    assert "format" in error_message
async def test_missing_conversation_id(tool: DocumentTool) -> None:
    """Omitting conversation_id fails and the error message names the field."""
    request = {"format": "word", "content": "# Test"}

    outcome = await tool.execute(**request)

    assert outcome["success"] is False
    error_message = outcome["error"]
    assert "conversation_id" in error_message
async def test_missing_content(tool: DocumentTool) -> None:
    """An empty content string fails and the error message mentions "content"."""
    request = {"format": "word", "content": "", "conversation_id": "conv-1"}

    outcome = await tool.execute(**request)

    assert outcome["success"] is False
    error_message = outcome["error"]
    assert "content" in error_message
async def test_invalid_format(tool: DocumentTool) -> None:
    """Requesting the format "pptx" is reported as a failure."""
    request = {"format": "pptx", "content": "# Test", "conversation_id": "conv-1"}

    outcome = await tool.execute(**request)

    # Only the failure flag is checked; the error text is not pinned here.
    assert outcome["success"] is False
# ---------------------------------------------------------------------------
# tool registration
# ---------------------------------------------------------------------------
def test_tool_name_and_schema(tool: DocumentTool) -> None:
    """The tool is named "document" and exposes the expected input schema."""
    assert tool.name == "document"

    schema = tool.input_schema
    assert schema["type"] == "object"

    properties = schema["properties"]
    for field in ("action", "format", "content", "conversation_id", "filename"):
        assert field in properties

    # U9: conversation_id is the only hard-required field; action defaults to "create"
    assert "conversation_id" in schema["required"]
    assert schema["properties"]["action"]["enum"] == ["create", "read"]
async def test_created_document_persisted(tool: DocumentTool, service: DocumentService) -> None:
    """A document created through the tool can be looked up through the service."""
    conversation = "conv-persist"

    outcome = await tool.execute(
        format="word",
        content="# Persisted",
        conversation_id=conversation,
    )
    assert outcome["success"] is True
    created = outcome["document"]
    doc_id = created["id"]

    # Listing by conversation returns exactly the new document.
    listed = await service.get_conversation_documents("conv-persist")
    assert len(listed) == 1
    assert listed[0].id == doc_id

    # Fetching by id returns a record with the same filename.
    fetched = await service.get_document(doc_id)
    assert fetched is not None
    assert fetched.filename == created["filename"]
# ---------------------------------------------------------------------------
# read action (U9)
# ---------------------------------------------------------------------------
async def test_read_text_file(tool: DocumentTool, tmp_path: Path) -> None:
    """Reading a .txt file returns its text and reports format "text"."""
    text_file = tmp_path / "notes.txt"
    text_file.write_text("Hello world\nLine two", encoding="utf-8")

    outcome = await tool.execute(
        action="read",
        filename=str(text_file),
        conversation_id="conv-1",
    )

    assert outcome["success"] is True
    assert "Hello world" in outcome["content"]
    assert outcome["metadata"]["format"] == "text"
async def test_read_markdown_file(tool: DocumentTool, tmp_path: Path) -> None:
    """Reading a .md file keeps the Markdown text and exposes the heading as title."""
    markdown_file = tmp_path / "doc.md"
    markdown_file.write_text("# Title\n\nParagraph.\n", encoding="utf-8")

    outcome = await tool.execute(
        action="read",
        filename=str(markdown_file),
        conversation_id="conv-1",
    )

    assert outcome["success"] is True
    assert "# Title" in outcome["content"]
    assert outcome["metadata"]["format"] == "markdown"
    assert outcome["title"] == "Title"
async def test_read_word_file(tool: DocumentTool, tmp_path: Path) -> None:
    """Round-trip: a .docx written by the tool can be read back by the tool."""
    # Step 1: create the .docx through the tool.
    created = await tool.execute(
        action="create",
        format="word",
        content="# Read Test\n\nContent for reading.",
        conversation_id="conv-1",
        filename="read-test.docx",
    )
    assert created["success"] is True

    # Step 2: locate the stored file on disk.
    # ponytail: use service.get_download_path to locate the file on disk
    doc_id = created["document"]["id"]
    backing_service = tool._service  # type: ignore[attr-defined]
    stored_path = backing_service.get_download_path(doc_id)
    assert stored_path is not None and stored_path.exists()

    # Step 3: read it back and check the extracted text and format.
    outcome = await tool.execute(
        action="read",
        filename=str(stored_path),
        conversation_id="conv-1",
    )
    assert outcome["success"] is True
    extracted = outcome["content"]
    assert "Read Test" in extracted
    assert "Content for reading" in extracted
    assert outcome["metadata"]["format"] == "docx"
async def test_read_excel_file(tool: DocumentTool, tmp_path: Path) -> None:
    """Round-trip: a .xlsx written by the tool can be read back by the tool."""
    sheet_json = '{"Sheet1": [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]]}'
    created = await tool.execute(
        action="create",
        format="excel",
        content=sheet_json,
        conversation_id="conv-1",
        filename="read-test.xlsx",
    )
    assert created["success"] is True

    # Resolve the stored workbook via the tool's underlying service.
    doc_id = created["document"]["id"]
    backing_service = tool._service  # type: ignore[attr-defined]
    stored_path = backing_service.get_download_path(doc_id)
    assert stored_path is not None and stored_path.exists()

    outcome = await tool.execute(
        action="read",
        filename=str(stored_path),
        conversation_id="conv-1",
    )
    assert outcome["success"] is True
    extracted = outcome["content"]
    assert "Alice" in extracted
    assert "Bob" in extracted

    metadata = outcome["metadata"]
    assert metadata["format"] == "xlsx"
    assert metadata["sheet_count"] >= 1
async def test_read_missing_file(tool: DocumentTool, tmp_path: Path) -> None:
    """Reading a path that does not exist is reported as a failure."""
    absent = tmp_path / "nonexistent.txt"

    outcome = await tool.execute(
        action="read",
        filename=str(absent),
        conversation_id="conv-1",
    )

    assert outcome["success"] is False
    # Either wording is accepted, compared case-insensitively.
    message = outcome["error"].lower()
    assert any(fragment in message for fragment in ("not found", "no such file"))
async def test_read_missing_filename(tool: DocumentTool) -> None:
    """A read request with no filename fails and the error mentions "filename"."""
    request = {"action": "read", "conversation_id": "conv-1"}

    outcome = await tool.execute(**request)

    assert outcome["success"] is False
    message = outcome["error"].lower()
    assert "filename" in message
async def test_read_uses_content_as_path_fallback(tool: DocumentTool, tmp_path: Path) -> None:
    """With no filename given, a read request treats ``content`` as the file path."""
    target = tmp_path / "via-content.txt"
    target.write_text("content-as-path", encoding="utf-8")

    # The path is passed via ``content``; ``filename`` is deliberately omitted.
    request = {
        "action": "read",
        "content": str(target),
        "conversation_id": "conv-1",
    }
    outcome = await tool.execute(**request)

    assert outcome["success"] is True
    assert "content-as-path" in outcome["content"]
async def test_unknown_action(tool: DocumentTool) -> None:
    """The action "delete" fails with an "unknown action" error."""
    request = {"action": "delete", "conversation_id": "conv-1"}

    outcome = await tool.execute(**request)

    assert outcome["success"] is False
    message = outcome["error"].lower()
    assert "unknown action" in message
async def test_create_action_explicit(tool: DocumentTool) -> None:
    """Passing action="create" explicitly produces a Word document successfully."""
    request = {
        "action": "create",
        "format": "word",
        "content": "# Explicit",
        "conversation_id": "conv-1",
    }

    outcome = await tool.execute(**request)

    assert outcome["success"] is True
    assert outcome["document"]["format"] == "word"
# ---------------------------------------------------------------------------
# DocumentLoader Excel support (U9)
# ---------------------------------------------------------------------------
def test_loader_detects_xlsx() -> None:
    """_detect_format maps .xlsx, .XLS and .xls file names to "xlsx"."""
    from agentkit.memory.document_loader import _detect_format

    # Covers the modern extension, an upper-case legacy one and a lower-case legacy one.
    for file_name in ("data.xlsx", "data.XLS", "data.xls"):
        assert _detect_format(file_name) == "xlsx"
def test_loader_parses_xlsx(tmp_path: Path) -> None:
    """Loading a one-sheet workbook yields its cell text plus sheet/row metadata."""
    import openpyxl

    workbook_path = tmp_path / "test.xlsx"

    # Build a single sheet named "Data" with a header row and two data rows.
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Data"
    for row in (["Name", "Age"], ["Alice", 30], ["Bob", 25]):
        sheet.append(row)
    workbook.save(workbook_path)
    workbook.close()

    loaded = DocumentLoader().load(workbook_path)

    for expected_text in ("Alice", "Bob", "Name"):
        assert expected_text in loaded.content

    metadata = loaded.metadata
    assert metadata["format"] == "xlsx"
    assert metadata["sheet_count"] == 1
    assert metadata["row_count"] == 3
    # Markdown table separator should be present
    assert "---" in loaded.content
def test_loader_parses_xlsx_multiple_sheets(tmp_path: Path) -> None:
    """Loading a two-sheet workbook emits one "## <sheet name>" heading per sheet."""
    import openpyxl

    workbook_path = tmp_path / "multi.xlsx"

    workbook = openpyxl.Workbook()

    # First sheet: reuse the default active sheet and rename it.
    first_sheet = workbook.active
    first_sheet.title = "Sheet1"
    for row in (["A", "B"], ["1", "2"]):
        first_sheet.append(row)

    # Second sheet: created explicitly.
    second_sheet = workbook.create_sheet("Sheet2")
    for row in (["C", "D"], ["3", "4"]):
        second_sheet.append(row)

    workbook.save(workbook_path)
    workbook.close()

    loaded = DocumentLoader().load(workbook_path)

    for heading in ("## Sheet1", "## Sheet2"):
        assert heading in loaded.content
    assert loaded.metadata["sheet_count"] == 2
def test_loader_parses_xlsx_empty_cells(tmp_path: Path) -> None:
    """Loading a workbook with an empty (None) cell does not fail."""
    import openpyxl

    workbook_path = tmp_path / "empty.xlsx"

    workbook = openpyxl.Workbook()
    sheet = workbook.active
    # The middle cell of the second row is left empty on purpose.
    for row in (["A", "B", "C"], ["x", None, "z"]):
        sheet.append(row)
    workbook.save(workbook_path)
    workbook.close()

    loaded = DocumentLoader().load(workbook_path)

    # Loading must not crash, and the neighbouring cell values must survive.
    # The column count of the rendered row is not asserted here.
    for expected_text in ("x", "z"):
        assert expected_text in loaded.content