141 lines
4.8 KiB
Python
141 lines
4.8 KiB
Python
"""Word (.docx) renderer — Markdown → python-docx.
|
|
|
|
Line-based Markdown parser mapping to python-docx objects. Supports:
|
|
- Headings (# H1 .. ###### H6)
|
|
- Bullet lists (- / * / +)
|
|
- Numbered lists (1. / 2.)
|
|
- GFM tables (| col | col |)
|
|
- Bold (**text**) and italic (*text*) inline formatting
|
|
- Plain paragraphs
|
|
|
|
Unsupported Markdown features (images, code blocks, blockquotes) fall back
|
|
to plain text — v1 scope per plan U2.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from pathlib import Path
|
|
|
|
from docx import Document
|
|
from docx.table import Table
|
|
from docx.text.paragraph import Paragraph
|
|
|
|
|
|
class WordRenderer:
    """Render Markdown content into a .docx file via python-docx.

    The parser is line-based: every non-blank line is classified as a
    heading, the start of a GFM table, a bullet item, a numbered item, or a
    plain paragraph. ``**bold**`` and ``*italic*`` spans become formatted
    runs in list items, paragraphs, and table cells; heading text is passed
    to python-docx unchanged.
    """

    # ATX heading: one to six '#', whitespace, then the title text.
    _HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$")
    # Bullet item: optional indent, one of - * +, whitespace, item text.
    _BULLET_RE = re.compile(r"^(\s*)[-*+]\s+(.+)$")
    # Numbered item: optional indent, digits followed by '.', item text.
    _NUMBERED_RE = re.compile(r"^(\s*)\d+\.\s+(.+)$")
    # GFM delimiter row such as "| --- | :-: |". Every column must contain at
    # least one dash, so rows of empty cells ("| | |") are not mistaken for it.
    _TABLE_SEPARATOR_RE = re.compile(
        r"^\s*\|\s*:?-+:?\s*(?:\|\s*:?-+:?\s*)*\|\s*$"
    )
    # **bold** or *italic*. Delimiters must hug non-space text, so a bare
    # asterisk used as an operator ("2 * 3 * 4") stays literal. The italic
    # delimiters also refuse a neighbouring '*', which keeps "** x **" from
    # being read as an italic span.
    _INLINE_RE = re.compile(
        r"\*\*(?=\S)(?P<bold>.+?)(?<=\S)\*\*"
        r"|(?<!\*)\*(?![\s*])(?P<italic>.+?)(?<![\s*])\*(?!\*)"
    )

    def render(self, markdown_content: str, output_path: Path) -> Path:
        """Render Markdown to a .docx file at output_path.

        Args:
            markdown_content: Markdown-formatted text.
            output_path: Destination .docx path. Missing parent directories
                are created.

        Returns:
            The output_path (for chaining).
        """
        doc = Document()
        self._render_lines(doc, markdown_content.splitlines())

        output_path.parent.mkdir(parents=True, exist_ok=True)
        doc.save(str(output_path))
        return output_path

    def _render_lines(self, doc: Document, lines: list[str]) -> None:
        """Classify each Markdown line and append the matching element to doc."""
        total = len(lines)
        i = 0
        while i < total:
            line = lines[i]
            i += 1

            # Blank lines only separate blocks; they produce no output.
            if not line.strip():
                continue

            heading = self._HEADING_RE.match(line)
            if heading:
                doc.add_heading(
                    heading.group(2).strip(), level=len(heading.group(1))
                )
                continue

            # A table is a row starting with '|' immediately followed by a
            # delimiter row; it extends over every following '|' line.
            if (
                line.lstrip().startswith("|")
                and i < total
                and self._TABLE_SEPARATOR_RE.match(lines[i])
            ):
                start = i - 1  # index of the header row
                i += 1  # step past the delimiter row
                while i < total and lines[i].lstrip().startswith("|"):
                    i += 1
                self._add_table(doc, lines[start:i])
                continue

            bullet = self._BULLET_RE.match(line)
            if bullet:
                para = doc.add_paragraph(style="List Bullet")
                self._add_inline_runs(para, bullet.group(2))
                continue

            numbered = self._NUMBERED_RE.match(line)
            if numbered:
                para = doc.add_paragraph(style="List Number")
                self._add_inline_runs(para, numbered.group(2))
                continue

            # Anything else is a plain paragraph.
            self._add_inline_runs(doc.add_paragraph(), line)

    @staticmethod
    def _split_table_row(line: str) -> list[str]:
        """Split one GFM table row into stripped cell texts.

        Only a single leading and a single trailing pipe are removed, so an
        empty first or last cell (``|| b |``) keeps its column position.
        """
        body = line.strip()
        if body.startswith("|"):
            body = body[1:]
        if body.endswith("|"):
            body = body[:-1]
        return [cell.strip() for cell in body.split("|")]

    def _add_table(self, doc: Document, table_lines: list[str]) -> Table:
        """Parse GFM table lines and add a python-docx table.

        Args:
            doc: Document the table is appended to.
            table_lines: Header row, delimiter row, then zero or more data
                rows. The second entry is always treated as the delimiter
                row and skipped.

        Returns:
            The created table. Rows shorter than the widest row leave their
            remaining cells empty.
        """
        rows = [
            self._split_table_row(line)
            for idx, line in enumerate(table_lines)
            if idx != 1  # the delimiter row carries no cell content
        ]
        if not rows:
            return doc.add_table(rows=0, cols=0)

        ncols = max(len(row) for row in rows)
        table = doc.add_table(rows=len(rows), cols=ncols)
        table.style = "Table Grid"
        for r_idx, row in enumerate(rows):
            for c_idx, cell_text in enumerate(row):
                # A fresh cell holds one empty paragraph; writing runs into it
                # lets **bold** / *italic* work in tables like everywhere else.
                cell_para = table.cell(r_idx, c_idx).paragraphs[0]
                self._add_inline_runs(cell_para, cell_text)
        return table

    def _add_inline_runs(self, para: Paragraph, text: str) -> None:
        """Add runs with bold/italic inline formatting.

        Supports **bold** and *italic*. Nested formatting is not supported
        in v1 — the first match wins. Asterisks that do not form a valid
        span are emitted as literal text.

        Args:
            para: Paragraph that receives the runs.
            text: Markdown source of the paragraph content.
        """
        pos = 0
        for match in self._INLINE_RE.finditer(text):
            # Plain text between the previous span and this one.
            if match.start() > pos:
                para.add_run(text[pos : match.start()])
            bold_text = match.group("bold")
            if bold_text is not None:
                para.add_run(bold_text).bold = True
            else:
                para.add_run(match.group("italic")).italic = True
            pos = match.end()
        # Plain text after the last span (or the whole text if none matched).
        if pos < len(text):
            para.add_run(text[pos:])
|