fischer-agentkit/src/agentkit/documents/renderers/word_renderer.py

141 lines
4.8 KiB
Python

"""Word (.docx) renderer — Markdown → python-docx.
Line-based Markdown parser mapping to python-docx objects. Supports:
- Headings (# H1 .. ###### H6)
- Bullet lists (- / * / +)
- Numbered lists (1. / 2.)
- GFM tables (| col | col |)
- Bold (**text**) and italic (*text*) inline formatting
- Plain paragraphs
Unsupported Markdown features (images, code blocks, blockquotes) fall back
to plain text — v1 scope per plan U2.
"""
from __future__ import annotations
import re
from pathlib import Path
from docx import Document
from docx.table import Table
from docx.text.paragraph import Paragraph
class WordRenderer:
    """Render Markdown content into a .docx file via python-docx.

    The renderer is a line-based parser: every non-empty input line is
    classified as a heading, the start of a GFM table, a bullet item, a
    numbered item, or a plain paragraph, and mapped to the matching
    python-docx object. Inline ``**bold**``, ``*italic*`` and
    ``***bold italic***`` markers are honoured in headings, list items,
    table cells and paragraphs.
    """

    # Line-level patterns, compiled once instead of on every input line.
    _HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")
    _TABLE_SEPARATOR_RE = re.compile(r"^\s*\|[\s:|-]+\|\s*$")
    _BULLET_RE = re.compile(r"^(\s*)[-*+]\s+(.+)$")
    _NUMBERED_RE = re.compile(r"^(\s*)\d+\.\s+(.+)$")

    # A pipe that is not escaped with a backslash, i.e. a real cell boundary.
    _UNESCAPED_PIPE_RE = re.compile(r"(?<!\\)\|")

    # Inline emphasis. The text right inside a delimiter must not be
    # whitespace or another asterisk, so stray asterisks such as
    # "2 * 3 * 4" stay literal instead of being swallowed as italics.
    _INLINE_RE = re.compile(
        r"""
        \*\*\*(?![\s*])(?P<both>.+?)(?<![\s*])\*\*\*
        | (?<!\*)\*\*(?![\s*])(?P<bold>.+?)(?<![\s*])\*\*(?!\*)
        | (?<!\*)\*(?![\s*])(?P<italic>.+?)(?<![\s*])\*(?!\*)
        """,
        re.VERBOSE,
    )

    def render(self, markdown_content: str, output_path: Path) -> Path:
        """Render Markdown to a .docx file at output_path.

        Args:
            markdown_content: Markdown-formatted text.
            output_path: Destination .docx path. Missing parent directories
                are created.

        Returns:
            The output_path (for chaining).
        """
        doc = Document()
        lines = markdown_content.splitlines()
        i = 0
        while i < len(lines):
            line = lines[i]
            i += 1

            # Blank lines only separate blocks; they produce no output.
            if not line.strip():
                continue

            # Heading: # .. ######
            heading_match = self._HEADING_RE.match(line)
            if heading_match:
                level = len(heading_match.group(1))
                # Create the heading empty and fill it through the inline
                # parser so **bold** / *italic* markers are not emitted
                # as literal asterisks.
                heading = doc.add_heading("", level=level)
                self._add_inline_runs(heading, heading_match.group(2).strip())
                continue

            # GFM table: line starts with | and the next line is a separator.
            if (
                line.lstrip().startswith("|")
                and i < len(lines)
                and self._TABLE_SEPARATOR_RE.match(lines[i])
            ):
                # Collect header, separator, then every following row that
                # still starts with a pipe.
                table_lines = [line, lines[i]]
                i += 1
                while i < len(lines) and lines[i].lstrip().startswith("|"):
                    table_lines.append(lines[i])
                    i += 1
                self._add_table(doc, table_lines)
                continue

            # Bullet list: - / * / +
            bullet_match = self._BULLET_RE.match(line)
            if bullet_match:
                para = doc.add_paragraph(style="List Bullet")
                self._add_inline_runs(para, bullet_match.group(2))
                continue

            # Numbered list: 1. / 2. etc.
            num_match = self._NUMBERED_RE.match(line)
            if num_match:
                para = doc.add_paragraph(style="List Number")
                self._add_inline_runs(para, num_match.group(2))
                continue

            # Anything else is a plain paragraph.
            para = doc.add_paragraph()
            self._add_inline_runs(para, line)

        output_path.parent.mkdir(parents=True, exist_ok=True)
        doc.save(str(output_path))
        return output_path

    @classmethod
    def _split_table_row(cls, line: str) -> list[str]:
        """Split one GFM table line into stripped cell strings.

        Exactly one leading and one trailing pipe are removed, so empty
        edge cells (``|| a |``) are preserved. Pipes escaped as ``\\|`` do
        not split a cell and are returned as a literal ``|``.

        Args:
            line: A single table line, e.g. ``| a | b |``.

        Returns:
            The cell texts in column order.
        """
        body = line.strip()
        if body.startswith("|"):
            body = body[1:]
        # A trailing "\|" is an escaped pipe belonging to the last cell,
        # not the closing border of the row.
        if body.endswith("|") and not body.endswith("\\|"):
            body = body[:-1]
        return [
            cell.strip().replace("\\|", "|")
            for cell in cls._UNESCAPED_PIPE_RE.split(body)
        ]

    def _add_table(self, doc: Document, table_lines: list[str]) -> Table:
        """Parse GFM table lines and add a python-docx table.

        Args:
            doc: Document the table is appended to.
            table_lines: Header line, separator line, then data lines.

        Returns:
            The created table, sized to the widest row and styled
            "Table Grid".
        """
        # Index 1 is the |---|---| separator row; it carries no cell data.
        rows = [
            self._split_table_row(line)
            for idx, line in enumerate(table_lines)
            if idx != 1
        ]
        if not rows:
            return doc.add_table(rows=0, cols=0)

        # Size to the widest row so ragged rows never index out of range.
        ncols = max(len(row) for row in rows)
        table = doc.add_table(rows=len(rows), cols=ncols)
        table.style = "Table Grid"
        for r_idx, row in enumerate(rows):
            for c_idx, cell_text in enumerate(row):
                # Fill the cell's initial paragraph through the inline
                # parser so emphasis markers inside cells are rendered.
                cell = table.cell(r_idx, c_idx)
                self._add_inline_runs(cell.paragraphs[0], cell_text)
        return table

    def _add_inline_runs(self, para: Paragraph, text: str) -> None:
        """Add runs with bold/italic inline formatting.

        Supports ``**bold**``, ``*italic*`` and ``***bold italic***``.
        Other nested formatting is not supported in v1: markers inside an
        already matched span are kept as literal text. Asterisks that are
        surrounded by whitespace are not treated as emphasis.

        Args:
            para: Paragraph that receives the runs.
            text: Inline Markdown text; an empty string adds no runs.
        """
        cursor = 0
        for match in self._INLINE_RE.finditer(text):
            # Plain text between the previous match and this one.
            if match.start() > cursor:
                para.add_run(text[cursor : match.start()])

            # Exactly one named group takes part in each match.
            kind = match.lastgroup
            run = para.add_run(match.group(kind))
            if kind in ("bold", "both"):
                run.bold = True
            if kind in ("italic", "both"):
                run.italic = True
            cursor = match.end()

        # Trailing plain text after the last match.
        if cursor < len(text):
            para.add_run(text[cursor:])