# fischer-agentkit/tests/tools/test_document_tool.py

"""Tests for DocumentTool — Agent tool wrapper (U6 create + U9 read)."""

from __future__ import annotations

from pathlib import Path

import pytest

from agentkit.documents.db import init_documents_db
from agentkit.documents.renderers.excel_renderer import ExcelRenderer
from agentkit.documents.renderers.pdf_renderer import PDFRenderer
from agentkit.documents.renderers.word_renderer import WordRenderer
from agentkit.documents.service import DocumentService
from agentkit.memory.document_loader import DocumentLoader
from agentkit.tools.document_tool import DocumentTool


@pytest.fixture
def service(tmp_path: Path) -> DocumentService:
    """Build a DocumentService backed by a fresh database under ``tmp_path``.

    The documents database is initialised first, then word, excel and pdf
    renderers are registered on the service before it is returned.
    """
    import asyncio

    database_file = tmp_path / "test.db"
    uploads_root = tmp_path / "uploads"
    # The fixture itself is synchronous, so the async DB init is driven here.
    asyncio.run(init_documents_db(database_file))

    document_service = DocumentService(upload_dir=uploads_root, db_path=database_file)
    for format_name, renderer_cls in (
        ("word", WordRenderer),
        ("excel", ExcelRenderer),
        ("pdf", PDFRenderer),
    ):
        document_service.register_renderer(format_name, renderer_cls())

    # ponytail: template filling is deliberately not set up in this fixture.
    # Per the original author's note, DocumentService keeps one renderer per
    # format and its template path calls renderer.render_template(), which
    # WordRenderer does not provide; a TemplateRenderer would be required for
    # that. The tests in this module therefore only cover create without a
    # template.
    return document_service


@pytest.fixture
def tool(service: DocumentService) -> DocumentTool:
    """Wrap the ``service`` fixture in the DocumentTool under test."""
    document_tool = DocumentTool(service=service)
    return document_tool


# ---------------------------------------------------------------------------
# create action — word
# ---------------------------------------------------------------------------


async def test_create_word(tool: DocumentTool) -> None:
    """Creating with format=word succeeds and reports .docx document metadata."""
    request = {
        "format": "word",
        "content": "# Test Report\n\nThis is a test paragraph.\n",
        "conversation_id": "conv-1",
    }
    outcome = await tool.execute(**request)

    assert outcome["success"] is True
    document = outcome["document"]
    assert document["format"] == "word"
    assert document["filename"].endswith(".docx")
    assert document["size"] > 0
    assert document["conversation_id"] == "conv-1"
    # The id must be non-empty (a UUID, per the original note).
    assert document["id"]


async def test_create_excel(tool: DocumentTool) -> None:
    """Creating with format=excel from a JSON string yields an .xlsx document."""
    sheet_json = '{"Data": [["A", "B"], ["1", "2"]]}'
    outcome = await tool.execute(
        format="excel",
        content=sheet_json,
        conversation_id="conv-1",
    )

    assert outcome["success"] is True
    document = outcome["document"]
    assert document["format"] == "excel"
    assert document["filename"].endswith(".xlsx")


async def test_create_pdf(tool: DocumentTool) -> None:
    """Creating with format=pdf from Markdown yields a .pdf document."""
    markdown_source = "# PDF Title\n\nParagraph text.\n"
    outcome = await tool.execute(
        format="pdf",
        content=markdown_source,
        conversation_id="conv-1",
    )

    assert outcome["success"] is True
    document = outcome["document"]
    assert document["format"] == "pdf"
    assert document["filename"].endswith(".pdf")


async def test_create_with_filename(tool: DocumentTool) -> None:
    """A caller-supplied filename is echoed back in the document metadata."""
    request = {
        "format": "word",
        "content": "# Test",
        "conversation_id": "conv-1",
        "filename": "my-report.docx",
    }
    outcome = await tool.execute(**request)

    assert outcome["success"] is True
    assert outcome["document"]["filename"] == "my-report.docx"


# ---------------------------------------------------------------------------
# error paths
# ---------------------------------------------------------------------------


async def test_missing_format(tool: DocumentTool) -> None:
    """Omitting format fails, and the error text mentions 'format'."""
    request = {"content": "# Test", "conversation_id": "conv-1"}
    outcome = await tool.execute(**request)

    assert outcome["success"] is False
    error_text = outcome["error"]
    assert "format" in error_text


async def test_missing_conversation_id(tool: DocumentTool) -> None:
    """Omitting conversation_id fails, and the error text mentions it."""
    request = {"format": "word", "content": "# Test"}
    outcome = await tool.execute(**request)

    assert outcome["success"] is False
    error_text = outcome["error"]
    assert "conversation_id" in error_text


async def test_missing_content(tool: DocumentTool) -> None:
    """An empty content string fails, and the error text mentions 'content'."""
    request = {"format": "word", "content": "", "conversation_id": "conv-1"}
    outcome = await tool.execute(**request)

    assert outcome["success"] is False
    error_text = outcome["error"]
    assert "content" in error_text


async def test_invalid_format(tool: DocumentTool) -> None:
    """A format the tool is not expected to support ("pptx") reports failure."""
    request = {"format": "pptx", "content": "# Test", "conversation_id": "conv-1"}
    outcome = await tool.execute(**request)

    assert outcome["success"] is False


# ---------------------------------------------------------------------------
# tool registration
# ---------------------------------------------------------------------------


def test_tool_name_and_schema(tool: DocumentTool) -> None:
    """The tool is named "document" and exposes the expected input schema."""
    assert tool.name == "document"

    schema = tool.input_schema
    assert schema["type"] == "object"

    properties = schema["properties"]
    for field_name in ("action", "format", "content", "conversation_id", "filename"):
        assert field_name in properties

    # U9: conversation_id is the only hard-required field; action defaults to "create"
    assert "conversation_id" in schema["required"]
    assert properties["action"]["enum"] == ["create", "read"]


async def test_created_document_persisted(tool: DocumentTool, service: DocumentService) -> None:
    """A document created through the tool can be fetched back from the service."""
    outcome = await tool.execute(
        format="word",
        content="# Persisted",
        conversation_id="conv-persist",
    )
    assert outcome["success"] is True
    created = outcome["document"]
    created_id = created["id"]

    # Listing by conversation returns exactly the document just created.
    conversation_docs = await service.get_conversation_documents("conv-persist")
    assert len(conversation_docs) == 1
    assert conversation_docs[0].id == created_id

    # Fetching by id returns a record with the same filename.
    stored = await service.get_document(created_id)
    assert stored is not None
    assert stored.filename == created["filename"]


# ---------------------------------------------------------------------------
# read action (U9)
# ---------------------------------------------------------------------------


async def test_read_text_file(tool: DocumentTool, tmp_path: Path) -> None:
    """Reading a .txt file returns its text and reports format "text"."""
    text_file = tmp_path / "notes.txt"
    text_file.write_text("Hello world\nLine two", encoding="utf-8")

    outcome = await tool.execute(
        action="read",
        filename=str(text_file),
        conversation_id="conv-1",
    )

    assert outcome["success"] is True
    assert "Hello world" in outcome["content"]
    assert outcome["metadata"]["format"] == "text"


async def test_read_markdown_file(tool: DocumentTool, tmp_path: Path) -> None:
    """Reading a .md file keeps the Markdown text and reports the title."""
    markdown_file = tmp_path / "doc.md"
    markdown_file.write_text("# Title\n\nParagraph.\n", encoding="utf-8")

    outcome = await tool.execute(
        action="read",
        filename=str(markdown_file),
        conversation_id="conv-1",
    )

    assert outcome["success"] is True
    assert "# Title" in outcome["content"]
    assert outcome["metadata"]["format"] == "markdown"
    assert outcome["title"] == "Title"


async def test_read_word_file(tool: DocumentTool, tmp_path: Path) -> None:
    """A .docx produced by the create action can be read back by the read action."""
    # Step 1: produce a .docx through the tool.
    created = await tool.execute(
        action="create",
        format="word",
        content="# Read Test\n\nContent for reading.",
        conversation_id="conv-1",
        filename="read-test.docx",
    )
    assert created["success"] is True

    # Step 2: locate the stored file on disk.
    # ponytail: the service's get_download_path maps the document id to a path.
    document_id = created["document"]["id"]
    backing_service = tool._service  # type: ignore[attr-defined]
    stored_path = backing_service.get_download_path(document_id)
    assert stored_path is not None
    assert stored_path.exists()

    # Step 3: read it back and check the extracted text.
    outcome = await tool.execute(
        action="read",
        filename=str(stored_path),
        conversation_id="conv-1",
    )
    assert outcome["success"] is True
    extracted = outcome["content"]
    assert "Read Test" in extracted
    assert "Content for reading" in extracted
    assert outcome["metadata"]["format"] == "docx"


async def test_read_excel_file(tool: DocumentTool, tmp_path: Path) -> None:
    """An .xlsx produced by the create action can be read back by the read action."""
    workbook_json = '{"Sheet1": [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]]}'
    created = await tool.execute(
        action="create",
        format="excel",
        content=workbook_json,
        conversation_id="conv-1",
        filename="read-test.xlsx",
    )
    assert created["success"] is True

    # Resolve the on-disk location of the generated workbook via the service.
    document_id = created["document"]["id"]
    backing_service = tool._service  # type: ignore[attr-defined]
    stored_path = backing_service.get_download_path(document_id)
    assert stored_path is not None
    assert stored_path.exists()

    outcome = await tool.execute(
        action="read",
        filename=str(stored_path),
        conversation_id="conv-1",
    )
    assert outcome["success"] is True
    extracted = outcome["content"]
    assert "Alice" in extracted
    assert "Bob" in extracted
    metadata = outcome["metadata"]
    assert metadata["format"] == "xlsx"
    assert metadata["sheet_count"] >= 1


async def test_read_missing_file(tool: DocumentTool, tmp_path: Path) -> None:
    """Reading a path that does not exist fails with a not-found style error."""
    absent_path = tmp_path / "nonexistent.txt"
    outcome = await tool.execute(
        action="read",
        filename=str(absent_path),
        conversation_id="conv-1",
    )

    assert outcome["success"] is False
    # Either wording is accepted, compared case-insensitively.
    message = outcome["error"].lower()
    assert any(phrase in message for phrase in ("not found", "no such file"))


async def test_read_missing_filename(tool: DocumentTool) -> None:
    """A read request with no filename fails, and the error mentions 'filename'."""
    request = {"action": "read", "conversation_id": "conv-1"}
    outcome = await tool.execute(**request)

    assert outcome["success"] is False
    message = outcome["error"].lower()
    assert "filename" in message


async def test_read_uses_content_as_path_fallback(tool: DocumentTool, tmp_path: Path) -> None:
    """With no filename given, a read request treats 'content' as the file path."""
    target = tmp_path / "via-content.txt"
    target.write_text("content-as-path", encoding="utf-8")

    request = {
        "action": "read",
        "content": str(target),
        "conversation_id": "conv-1",
    }
    outcome = await tool.execute(**request)

    assert outcome["success"] is True
    assert "content-as-path" in outcome["content"]


async def test_unknown_action(tool: DocumentTool) -> None:
    """An action outside create/read ("delete") fails with an 'unknown action' error."""
    request = {"action": "delete", "conversation_id": "conv-1"}
    outcome = await tool.execute(**request)

    assert outcome["success"] is False
    message = outcome["error"].lower()
    assert "unknown action" in message


async def test_create_action_explicit(tool: DocumentTool) -> None:
    """Passing action='create' explicitly produces a word document as usual."""
    request = {
        "action": "create",
        "format": "word",
        "content": "# Explicit",
        "conversation_id": "conv-1",
    }
    outcome = await tool.execute(**request)

    assert outcome["success"] is True
    assert outcome["document"]["format"] == "word"


# ---------------------------------------------------------------------------
# DocumentLoader Excel support (U9)
# ---------------------------------------------------------------------------


def test_loader_detects_xlsx() -> None:
    """_detect_format maps .xlsx, .xls and upper-case .XLS names to "xlsx"."""
    from agentkit.memory.document_loader import _detect_format

    for spreadsheet_name in ("data.xlsx", "data.XLS", "data.xls"):
        assert _detect_format(spreadsheet_name) == "xlsx"


def test_loader_parses_xlsx(tmp_path: Path) -> None:
    """Loading a one-sheet workbook exposes its cells, counts and a table separator."""
    import openpyxl

    workbook_path = tmp_path / "test.xlsx"
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = "Data"
    for row in (["Name", "Age"], ["Alice", 30], ["Bob", 25]):
        sheet.append(row)
    workbook.save(workbook_path)
    workbook.close()

    loaded = DocumentLoader().load(workbook_path)

    for expected_text in ("Alice", "Bob", "Name"):
        assert expected_text in loaded.content
    assert loaded.metadata["format"] == "xlsx"
    assert loaded.metadata["sheet_count"] == 1
    assert loaded.metadata["row_count"] == 3
    # Markdown table separator should be present
    assert "---" in loaded.content


def test_loader_parses_xlsx_multiple_sheets(tmp_path: Path) -> None:
    """Each sheet of a two-sheet workbook appears under its own '## <title>' heading."""
    import openpyxl

    workbook_path = tmp_path / "multi.xlsx"
    workbook = openpyxl.Workbook()

    first_sheet = workbook.active
    first_sheet.title = "Sheet1"
    for row in (["A", "B"], ["1", "2"]):
        first_sheet.append(row)

    second_sheet = workbook.create_sheet("Sheet2")
    for row in (["C", "D"], ["3", "4"]):
        second_sheet.append(row)

    workbook.save(workbook_path)
    workbook.close()

    loaded = DocumentLoader().load(workbook_path)

    for heading in ("## Sheet1", "## Sheet2"):
        assert heading in loaded.content
    assert loaded.metadata["sheet_count"] == 2


def test_loader_parses_xlsx_empty_cells(tmp_path: Path) -> None:
    """A None cell does not break loading; its neighbours still appear in the content."""
    import openpyxl

    workbook_path = tmp_path / "empty.xlsx"
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    # The second row has a gap in the middle column.
    for row in (["A", "B", "C"], ["x", None, "z"]):
        sheet.append(row)
    workbook.save(workbook_path)
    workbook.close()

    loaded = DocumentLoader().load(workbook_path)

    # Loading must not raise, and the values either side of the gap survive.
    for neighbour in ("x", "z"):
        assert neighbour in loaded.content