# fischer-agentkit/src/agentkit/documents/renderers/word_renderer.py

"""Word (.docx) renderer — Markdown → python-docx.

Line-based Markdown parser mapping to python-docx objects. Supports:
- Headings (# H1 .. ###### H6)
- Bullet lists (- / * / +)
- Numbered lists (1. / 2.)
- GFM tables (| col | col |)
- Bold (**text**) and italic (*text*) inline formatting
- Plain paragraphs

Unsupported Markdown features (images, code blocks, blockquotes) fall back
to plain text — v1 scope per plan U2.
"""

from __future__ import annotations

import re
from pathlib import Path

from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph


class WordRenderer:
    """Render Markdown content into a .docx file via python-docx.

    Parsing is line-based: every non-blank line is classified as a heading,
    the start of a GFM table, a bullet item, a numbered item, or a plain
    paragraph. Inline ``**bold**`` / ``*italic*`` markers are converted to
    formatted runs in headings, list items, paragraphs and table cells alike.
    """

    # Line classifiers, compiled once instead of on every input line.
    _HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")
    _TABLE_SEPARATOR_RE = re.compile(r"^\s*\|[\s:|-]+\|\s*$")
    _BULLET_RE = re.compile(r"^(\s*)[-*+]\s+(.+)$")
    _NUMBERED_RE = re.compile(r"^(\s*)\d+\.\s+(.+)$")

    # A pipe that is not preceded by a backslash, i.e. a real cell boundary.
    _UNESCAPED_PIPE_RE = re.compile(r"(?<!\\)\|")

    # **bold** (group 1) or *italic* (group 2). The lookarounds require the
    # delimiters to hug non-whitespace so that "2 * 3 * 4" stays plain text.
    _INLINE_RE = re.compile(
        r"\*\*(?=\S)(.+?)(?<=\S)\*\*|\*(?=\S)(.+?)(?<=\S)\*"
    )

    def render(self, markdown_content: str, output_path: Path) -> Path:
        """Render Markdown to a .docx file at output_path.

        Args:
            markdown_content: Markdown-formatted text.
            output_path: Destination .docx path. A plain string is accepted
                and converted to a ``Path``. Missing parent directories are
                created.

        Returns:
            The output path as a ``Path`` (for chaining).
        """
        output_path = Path(output_path)
        doc = Document()
        lines = markdown_content.splitlines()
        i = 0
        while i < len(lines):
            line = lines[i]
            i += 1  # `i` now points at the line *after* the current one

            # Blank lines only separate blocks; they produce no output.
            if not line.strip():
                continue

            # Heading: # .. ######
            heading_match = self._HEADING_RE.match(line)
            if heading_match:
                level = len(heading_match.group(1))
                # Create the heading empty, then add runs so that inline
                # markers are formatted rather than emitted literally.
                heading = doc.add_heading(level=level)
                self._add_inline_runs(heading, heading_match.group(2).strip())
                continue

            # GFM table: line starts with | and the next line is a separator.
            if (
                line.lstrip().startswith("|")
                and i < len(lines)
                and self._TABLE_SEPARATOR_RE.match(lines[i])
            ):
                # Collect header, separator, then every following "|" row.
                table_lines = [line, lines[i]]
                i += 1
                while i < len(lines) and lines[i].lstrip().startswith("|"):
                    table_lines.append(lines[i])
                    i += 1
                self._add_table(doc, table_lines)
                continue

            # Bullet list: - / * / +
            bullet_match = self._BULLET_RE.match(line)
            if bullet_match:
                para = doc.add_paragraph(style="List Bullet")
                self._add_inline_runs(para, bullet_match.group(2))
                continue

            # Numbered list: 1. / 2. etc.
            num_match = self._NUMBERED_RE.match(line)
            if num_match:
                para = doc.add_paragraph(style="List Number")
                self._add_inline_runs(para, num_match.group(2))
                continue

            # Anything else is a plain paragraph.
            para = doc.add_paragraph()
            self._add_inline_runs(para, line)

        output_path.parent.mkdir(parents=True, exist_ok=True)
        doc.save(str(output_path))
        return output_path

    @classmethod
    def _split_table_row(cls, line: str) -> list[str]:
        """Split one GFM table row into stripped cell strings.

        Exactly one leading and one trailing pipe are removed, so empty edge
        cells (``|| a |``) are preserved. Pipes escaped as ``\\|`` are kept
        inside their cell and unescaped to ``|``.
        """
        body = line.strip()
        if body.startswith("|"):
            body = body[1:]
        # A trailing "\|" is cell content, not the closing delimiter.
        if body.endswith("|") and not body.endswith("\\|"):
            body = body[:-1]
        return [
            cell.strip().replace("\\|", "|")
            for cell in cls._UNESCAPED_PIPE_RE.split(body)
        ]

    def _add_table(self, doc: Document, table_lines: list[str]) -> Table:
        """Parse GFM table lines and add a python-docx table.

        Args:
            doc: Document the table is appended to.
            table_lines: Header row, separator row, then zero or more data
                rows. The second entry (the separator) is ignored.

        Returns:
            The created table. Rows shorter than the widest row leave their
            remaining cells empty.
        """
        # Index 1 is the |---|---| separator and carries no cell content.
        rows = [
            self._split_table_row(line)
            for idx, line in enumerate(table_lines)
            if idx != 1
        ]

        if not rows:
            return doc.add_table(rows=0, cols=0)

        ncols = max(len(row) for row in rows)
        table = doc.add_table(rows=len(rows), cols=ncols)
        table.style = "Table Grid"
        for r_idx, row in enumerate(rows):
            for c_idx, cell_text in enumerate(row):
                # Write into the cell's initial paragraph so that inline
                # bold/italic markers are rendered as in body text.
                cell_para = table.cell(r_idx, c_idx).paragraphs[0]
                self._add_inline_runs(cell_para, cell_text)
        return table

    @classmethod
    def _parse_inline(cls, text: str) -> list[tuple[str, bool, bool]]:
        """Split text into ``(segment, bold, italic)`` tuples.

        Supports **bold** and *italic*. Nested formatting is not supported
        in v1 — the first match wins. Asterisks adjacent to whitespace on
        the inside are not treated as emphasis delimiters.
        """
        segments: list[tuple[str, bool, bool]] = []
        pos = 0
        for match in cls._INLINE_RE.finditer(text):
            # Plain text between the previous match and this one.
            if match.start() > pos:
                segments.append((text[pos : match.start()], False, False))
            bold_text, italic_text = match.groups()
            if bold_text is not None:
                segments.append((bold_text, True, False))
            else:
                segments.append((italic_text, False, True))
            pos = match.end()
        # Trailing plain text after the last match.
        if pos < len(text):
            segments.append((text[pos:], False, False))
        return segments

    def _add_inline_runs(self, para: Paragraph, text: str) -> None:
        """Add runs with bold/italic inline formatting.

        Args:
            para: Paragraph (or heading paragraph) receiving the runs.
            text: Markdown text that may contain **bold** / *italic* spans.
        """
        for segment, bold, italic in self._parse_inline(text):
            run = para.add_run(segment)
            if bold:
                run.bold = True
            if italic:
                run.italic = True