# fischer-agentkit/tests/integration/test_document_e2e.py

"""End-to-end integration tests for document processing (F1, F2, F3).

Verifies complete user flows:
- F1: Create document → List → Download → Verify content
- F2: Upload template → Create with template → Download → Verify variables replaced
- F3: Cross-conversation isolation
"""

from __future__ import annotations

import asyncio
import io
from pathlib import Path

import pytest
from docx import Document as DocxDocument
from fastapi import FastAPI
from fastapi.testclient import TestClient
from openpyxl import load_workbook

from agentkit.documents.db import init_documents_db
from agentkit.documents.renderers.excel_renderer import ExcelRenderer
from agentkit.documents.renderers.pdf_renderer import PDFRenderer
from agentkit.documents.renderers.word_renderer import WordRenderer
from agentkit.documents.service import DocumentService
from agentkit.server.routes import documents as documents_routes


@pytest.fixture
def app(tmp_path: Path) -> FastAPI:
    """Build a FastAPI app wired to a fresh document service.

    The database file and the upload directory both live under ``tmp_path``.
    Only the word, excel and pdf renderers are registered: per the Bug 2 fix,
    DocumentService lazy-loads TemplateRenderer on its own when a
    template_path is provided, so it is not registered separately.
    """
    database_file = tmp_path / "test.db"
    asyncio.run(init_documents_db(database_file))

    document_service = DocumentService(
        upload_dir=tmp_path / "uploads",
        db_path=database_file,
    )
    # Instantiate each renderer right before registering it, one at a time.
    for format_name, renderer_cls in (
        ("word", WordRenderer),
        ("excel", ExcelRenderer),
        ("pdf", PDFRenderer),
    ):
        document_service.register_renderer(format_name, renderer_cls())

    application = FastAPI()
    application.state.document_service = document_service
    application.state.server_config = None  # No auth for E2E tests
    application.include_router(documents_routes.router, prefix="/api/v1")
    return application


@pytest.fixture
def client(app: FastAPI) -> TestClient:
    """Return a TestClient bound to the ``app`` fixture."""
    test_client = TestClient(app)
    return test_client


# ---------------------------------------------------------------------------
# F1: Create → List → Download complete flow
# ---------------------------------------------------------------------------


class TestF1CreateListDownload:
    """F1: User creates a document, sees it in the list, downloads it.

    Every request's status code is asserted before its JSON body is read so
    that a failing endpoint is reported with the server's response text
    instead of an opaque ``KeyError``.
    """

    def test_e2e_word_create_list_download(self, client: TestClient) -> None:
        """Word: create → list contains it → download content matches."""
        # Step 1: Create
        create_resp = client.post(
            "/api/v1/documents/create",
            json={
                "format": "word",
                "content": "# E2E Report\n\nThis is the report content.",
                "conversation_id": "conv-e2e-1",
            },
        )
        assert create_resp.status_code == 200, create_resp.text
        doc = create_resp.json()["document"]
        doc_id = doc["id"]
        assert doc["format"] == "word"
        assert doc["filename"].endswith(".docx")
        assert doc["size"] > 0

        # Step 2: List — document appears in conversation
        list_resp = client.get("/api/v1/documents/conversation/conv-e2e-1")
        assert list_resp.status_code == 200, list_resp.text
        docs = list_resp.json()["documents"]
        assert len(docs) == 1
        assert docs[0]["id"] == doc_id
        assert docs[0]["download_url"] == f"/api/v1/documents/download/{doc_id}"

        # Step 3: Download — byte count matches the size reported at creation
        dl_resp = client.get(f"/api/v1/documents/download/{doc_id}")
        assert dl_resp.status_code == 200
        assert len(dl_resp.content) == doc["size"]

        # Step 4: Verify downloaded file is a valid .docx with correct content
        docx = DocxDocument(io.BytesIO(dl_resp.content))
        text = "\n".join(p.text for p in docx.paragraphs)
        assert "E2E Report" in text
        assert "This is the report content" in text

    def test_e2e_excel_create_list_download(self, client: TestClient) -> None:
        """Excel: create → list → download → verify cell content."""
        create_resp = client.post(
            "/api/v1/documents/create",
            json={
                "format": "excel",
                "content": '{"Sales": [["Product", "Revenue"], ["Widget", "1000"], ["Gadget", "2000"]]}',
                "conversation_id": "conv-e2e-2",
            },
        )
        assert create_resp.status_code == 200, create_resp.text
        doc_id = create_resp.json()["document"]["id"]

        # List
        list_resp = client.get("/api/v1/documents/conversation/conv-e2e-2")
        assert list_resp.status_code == 200, list_resp.text
        assert list_resp.json()["count"] == 1

        # Download and verify
        dl_resp = client.get(f"/api/v1/documents/download/{doc_id}")
        assert dl_resp.status_code == 200

        wb = load_workbook(io.BytesIO(dl_resp.content))
        try:
            ws = wb["Sales"]
            assert ws["A1"].value == "Product"
            assert ws["B1"].value == "Revenue"
            assert ws["A2"].value == "Widget"
            assert ws["B2"].value == "1000"
        finally:
            # Close the workbook even when one of the cell assertions fails.
            wb.close()

    def test_e2e_pdf_create_list_download(self, client: TestClient) -> None:
        """PDF: create → list → download → verify PDF magic bytes."""
        create_resp = client.post(
            "/api/v1/documents/create",
            json={
                "format": "pdf",
                "content": "# PDF Report\n\nContent here.",
                "conversation_id": "conv-e2e-3",
            },
        )
        assert create_resp.status_code == 200, create_resp.text
        doc_id = create_resp.json()["document"]["id"]

        # List
        list_resp = client.get("/api/v1/documents/conversation/conv-e2e-3")
        assert list_resp.status_code == 200, list_resp.text
        assert list_resp.json()["count"] == 1

        # Download and verify PDF magic
        dl_resp = client.get(f"/api/v1/documents/download/{doc_id}")
        assert dl_resp.status_code == 200
        assert dl_resp.content[:4] == b"%PDF"

    def test_e2e_multiple_documents_same_conversation(self, client: TestClient) -> None:
        """Multiple documents in same conversation — list shows all of them.

        Only presence and uniqueness are checked; the order of the returned
        list is not asserted.
        """
        conv_id = "conv-multi"

        # Create 3 documents, one per format
        for i, fmt in enumerate(["word", "excel", "pdf"]):
            resp = client.post(
                "/api/v1/documents/create",
                json={
                    "format": fmt,
                    "content": f"# Doc {i}",
                    "conversation_id": conv_id,
                },
            )
            assert resp.status_code == 200, f"{fmt}: {resp.text}"

        # List — all 3 present
        list_resp = client.get(f"/api/v1/documents/conversation/{conv_id}")
        assert list_resp.status_code == 200, list_resp.text
        data = list_resp.json()
        assert data["count"] == 3

        formats = {d["format"] for d in data["documents"]}
        assert formats == {"word", "excel", "pdf"}

        # Each has a unique download URL
        urls = {d["download_url"] for d in data["documents"]}
        assert len(urls) == 3

    def test_e2e_download_returns_correct_filename(self, client: TestClient) -> None:
        """Download response includes the original filename in Content-Disposition."""
        create_resp = client.post(
            "/api/v1/documents/create",
            json={
                "format": "word",
                "content": "# Test",
                "conversation_id": "conv-fn",
                "filename": "my-report.docx",
            },
        )
        # Fail here with the server's message rather than on the key lookup below.
        assert create_resp.status_code == 200, create_resp.text
        doc_id = create_resp.json()["document"]["id"]

        dl_resp = client.get(f"/api/v1/documents/download/{doc_id}")
        assert dl_resp.status_code == 200
        # The requested filename is expected in the Content-Disposition header
        assert "my-report.docx" in dl_resp.headers.get("content-disposition", "")


# ---------------------------------------------------------------------------
# F2: Template upload → create with template → download
# ---------------------------------------------------------------------------


class TestF2TemplateWorkflow:
    """F2: Upload template → Create with template → Download → Verify variables.

    After Bug 2 fix, template filling works with the standard WordRenderer
    registration — DocumentService lazy-loads TemplateRenderer internally.
    """

    @staticmethod
    def _upload_template(client: TestClient, template_path: Path) -> str:
        """Upload the .docx at ``template_path`` and return its stored name.

        The upload status is asserted first so that a rejected upload fails
        with the server's response text instead of a ``KeyError`` on
        ``stored_name``.
        """
        with template_path.open("rb") as f:
            upload_resp = client.post(
                "/api/v1/documents/upload-template",
                files={"file": (template_path.name, f, "application/octet-stream")},
            )
        assert upload_resp.status_code == 200, upload_resp.text
        return upload_resp.json()["stored_name"]

    def test_e2e_upload_template_create_download(
        self, client: TestClient, tmp_path: Path
    ) -> None:
        """Complete template workflow: upload → fill → download → verify."""
        # Step 1: Create a .docx template with Jinja2 placeholders
        template_doc = DocxDocument()
        template_doc.add_heading("Invoice {{invoice_number}}", level=1)
        template_doc.add_paragraph("Customer: {{customer_name}}")
        template_doc.add_paragraph("Amount: ${{amount}}")
        template_path = tmp_path / "invoice_template.docx"
        template_doc.save(str(template_path))

        # Step 2: Upload the template
        stored_name = self._upload_template(client, template_path)

        # Step 3: Create document using the template
        create_resp = client.post(
            "/api/v1/documents/create",
            json={
                "format": "word",
                "content": "",  # Ignored when template is provided
                "conversation_id": "conv-template",
                "template": stored_name,
                "template_data": {
                    "invoice_number": "INV-2026-001",
                    "customer_name": "Acme Corp",
                    "amount": "1,234.56",
                },
            },
        )
        assert create_resp.status_code == 200, create_resp.text
        doc_id = create_resp.json()["document"]["id"]

        # Step 4: Download and verify variables were replaced
        dl_resp = client.get(f"/api/v1/documents/download/{doc_id}")
        assert dl_resp.status_code == 200

        docx = DocxDocument(io.BytesIO(dl_resp.content))
        text = "\n".join(p.text for p in docx.paragraphs)
        assert "INV-2026-001" in text
        assert "Acme Corp" in text
        assert "1,234.56" in text
        # Placeholders should be gone
        assert "{{" not in text
        assert "}}" not in text

    def test_e2e_template_with_loop(
        self, client: TestClient, tmp_path: Path
    ) -> None:
        """Template with {% for %} loop — verify loop expands correctly."""
        template_doc = DocxDocument()
        template_doc.add_heading("Shopping List", level=1)
        # NOTE: docxtpl uses {%p %} for paragraph-level loops, {% %} for inline
        template_doc.add_paragraph("{%p for item in items %}")
        template_doc.add_paragraph("- {{item}}")
        template_doc.add_paragraph("{%p endfor %}")
        template_path = tmp_path / "loop_template.docx"
        template_doc.save(str(template_path))

        stored_name = self._upload_template(client, template_path)

        create_resp = client.post(
            "/api/v1/documents/create",
            json={
                "format": "word",
                "content": "",
                "conversation_id": "conv-loop",
                "template": stored_name,
                "template_data": {
                    "items": ["Apple", "Banana", "Cherry"],
                },
            },
        )
        assert create_resp.status_code == 200, create_resp.text
        doc_id = create_resp.json()["document"]["id"]

        dl_resp = client.get(f"/api/v1/documents/download/{doc_id}")
        assert dl_resp.status_code == 200

        docx = DocxDocument(io.BytesIO(dl_resp.content))
        text = "\n".join(p.text for p in docx.paragraphs)
        # Every list item must have been emitted by the loop
        for item in ("Apple", "Banana", "Cherry"):
            assert item in text


# ---------------------------------------------------------------------------
# F3: Cross-conversation isolation
# ---------------------------------------------------------------------------


class TestF3ConversationIsolation:
    """F3: Documents from one conversation don't leak to another."""

    def test_e2e_conversation_isolation(self, client: TestClient) -> None:
        """Documents in conv-A don't appear in conv-B's list."""
        # Create in conv-A
        create_a = client.post(
            "/api/v1/documents/create",
            json={
                "format": "word",
                "content": "# Conv A Doc",
                "conversation_id": "conv-A",
            },
        )
        # A failed create must be reported here, not as a count mismatch below.
        assert create_a.status_code == 200, create_a.text
        id_a = create_a.json()["document"]["id"]

        # Create in conv-B
        create_b = client.post(
            "/api/v1/documents/create",
            json={
                "format": "pdf",
                "content": "# Conv B Doc",
                "conversation_id": "conv-B",
            },
        )
        assert create_b.status_code == 200, create_b.text
        id_b = create_b.json()["document"]["id"]

        # List conv-A — only conv-A's doc
        resp_a = client.get("/api/v1/documents/conversation/conv-A")
        assert resp_a.status_code == 200, resp_a.text
        docs_a = resp_a.json()["documents"]
        assert len(docs_a) == 1
        assert docs_a[0]["id"] == id_a
        assert docs_a[0]["format"] == "word"

        # List conv-B — only conv-B's doc
        resp_b = client.get("/api/v1/documents/conversation/conv-B")
        assert resp_b.status_code == 200, resp_b.text
        docs_b = resp_b.json()["documents"]
        assert len(docs_b) == 1
        assert docs_b[0]["id"] == id_b
        assert docs_b[0]["format"] == "pdf"

    def test_e2e_download_any_document_by_id(self, client: TestClient) -> None:
        """Download works by doc_id regardless of conversation (no ACL in v1)."""
        # Create in conv-X
        create_resp = client.post(
            "/api/v1/documents/create",
            json={
                "format": "word",
                "content": "# Downloadable",
                "conversation_id": "conv-X",
            },
        )
        assert create_resp.status_code == 200, create_resp.text
        doc_id = create_resp.json()["document"]["id"]

        # Download without specifying conversation — works (v1 has no ACL)
        dl_resp = client.get(f"/api/v1/documents/download/{doc_id}")
        assert dl_resp.status_code == 200
        assert len(dl_resp.content) > 0


# ---------------------------------------------------------------------------
# Data consistency checks
# ---------------------------------------------------------------------------


class TestDataConsistency:
    """Verify document metadata agrees with what the download endpoint serves."""

    def test_metadata_size_matches_file(self, client: TestClient) -> None:
        """Document metadata size equals the number of bytes downloaded."""
        create_resp = client.post(
            "/api/v1/documents/create",
            json={
                "format": "word",
                "content": "# Size Check\n\nContent.",
                "conversation_id": "conv-size",
            },
        )
        assert create_resp.status_code == 200, create_resp.text
        doc = create_resp.json()["document"]
        meta_size = doc["size"]
        doc_id = doc["id"]

        # Download and compare the served byte count with the metadata
        dl_resp = client.get(f"/api/v1/documents/download/{doc_id}")
        assert dl_resp.status_code == 200
        assert len(dl_resp.content) == meta_size

    def test_filename_has_correct_extension(self, client: TestClient) -> None:
        """Each format produces the correct file extension."""
        for fmt, ext in [("word", ".docx"), ("excel", ".xlsx"), ("pdf", ".pdf")]:
            resp = client.post(
                "/api/v1/documents/create",
                json={
                    "format": fmt,
                    "content": "# Test",
                    "conversation_id": f"conv-ext-{fmt}",
                },
            )
            # Name the failing format instead of raising KeyError on "document".
            assert resp.status_code == 200, f"{fmt}: {resp.text}"
            filename = resp.json()["document"]["filename"]
            assert filename.endswith(ext), f"{fmt} should produce {ext}, got {filename}"

    def test_custom_filename_preserved(self, client: TestClient) -> None:
        """Custom filename is preserved in metadata and download."""
        resp = client.post(
            "/api/v1/documents/create",
            json={
                "format": "pdf",
                "content": "# Custom Name",
                "conversation_id": "conv-custom",
                "filename": "quarterly-report.pdf",
            },
        )
        assert resp.status_code == 200, resp.text
        doc = resp.json()["document"]
        assert doc["filename"] == "quarterly-report.pdf"

        dl_resp = client.get(f"/api/v1/documents/download/{doc['id']}")
        # Without this check a 404 would surface as a missing-header failure.
        assert dl_resp.status_code == 200
        assert "quarterly-report.pdf" in dl_resp.headers.get("content-disposition", "")