# geo/backend/app/services/content/html_generator.py

import re
from typing import Optional


class HTMLGenerator:
    """HTML generator that adapts HTML content to a target platform's rules.

    The class is stateless. All processing is regex based, so it is meant for
    reasonably well-formed article HTML, not arbitrary markup.
    """

    # Opening <a> tag whose quoted href is an absolute http(s) URL on a host
    # other than mp.weixin.qq.com. The character class after the host makes
    # the check an exact host match, so "mp.weixin.qq.com.evil.com" is still
    # treated as external.
    _EXTERNAL_LINK_OPEN = (
        r"<a\b[^>]*href=['\"]https?://"
        r"(?!mp\.weixin\.qq\.com[/:?#'\"])"
        r"[^'\"]*['\"][^>]*>"
    )

    def generate(self, content: str, platform: str, format: str = "html") -> str:
        """Generate platform-adapted output from HTML content.

        Args:
            content: Source HTML.
            platform: Platform identifier used to look up ``PLATFORM_RULES``.
            format: Output format: "html" (default), "markdown" or "plain".
                Any other value returns the processed HTML.

        Returns:
            The processed content in the requested format.
        """
        # Imported at call time, as in the original implementation.
        from app.services.distribution.platform_rules import PLATFORM_RULES

        rules = PLATFORM_RULES.get(platform) or {}
        html_rules = rules.get("html_rules") or {}
        banned_tags = html_rules.get("banned_tags") or []

        result = self._strip_banned_tags(content, banned_tags)

        # Platform-specific handling.
        if platform == "wechat":
            # WeChat official accounts: drop external links, keep their text.
            result = self._unwrap_external_links(result)

        if format == "markdown":
            return self.to_markdown(result)
        if format == "plain":
            return self.to_plain(result)
        return result

    @staticmethod
    def _strip_banned_tags(content: str, banned_tags) -> str:
        """Remove every banned element (and its content) from ``content``.

        Args:
            content: Source HTML.
            banned_tags: Iterable of tag names to remove; empty names are
                skipped.

        Returns:
            HTML with the banned elements removed.
        """
        result = content
        for tag in banned_tags:
            if not tag:
                continue
            # Escape the name and require a name boundary after it so that
            # banning "a" does not also match <article>, <abbr>, ...
            name = re.escape(tag) + r"(?![\w:-])"
            # Paired elements: remove the tag together with its content.
            result = re.sub(
                rf"<{name}[^>]*>.*?</{name}\s*>",
                "",
                result,
                flags=re.DOTALL | re.IGNORECASE,
            )
            # Self-closing/void tags and any orphaned opening or closing tag.
            result = re.sub(rf"</?{name}[^>]*>", "", result, flags=re.IGNORECASE)
        return result

    @classmethod
    def _unwrap_external_links(cls, content: str) -> str:
        """Replace external ``<a>`` elements with their inner content.

        Anchors pointing at mp.weixin.qq.com, and anchors without an absolute
        http(s) href, are left untouched so their closing tags stay balanced.

        Args:
            content: Source HTML.

        Returns:
            HTML with external anchors unwrapped.
        """
        result = re.sub(
            cls._EXTERNAL_LINK_OPEN + r"(.*?)</a\s*>",
            r"\1",
            content,
            flags=re.DOTALL | re.IGNORECASE,
        )
        # An external opener with no matching </a>: drop the opener alone.
        return re.sub(cls._EXTERNAL_LINK_OPEN, "", result, flags=re.IGNORECASE)

    @staticmethod
    def _quote_block(inner: str) -> str:
        """Render the inner text of a blockquote as Markdown quote lines."""
        body = re.sub(r"\n{2,}", "\n\n", inner.strip())
        # Prefix every line; blank lines become a bare ">" so the quote
        # stays one contiguous block.
        quoted = "\n".join(f"> {line}".rstrip() for line in body.splitlines())
        return f"\n\n{quoted}\n\n"

    def to_markdown(self, content: str) -> str:
        """Convert HTML to Markdown.

        Handles headings (h1-h6), paragraphs, line breaks, list items,
        blockquotes, inline code and preformatted blocks. Any other tag is
        stripped and only its text is kept.

        Args:
            content: Source HTML.

        Returns:
            Markdown text with surrounding whitespace stripped.
        """
        flags = re.IGNORECASE | re.DOTALL

        # <pre> goes first: a later <code> pass must not put inline backticks
        # inside the fence. An optional <code> wrapper is unwrapped.
        content = re.sub(
            r"<pre\b[^>]*>(?:\s*<code\b[^>]*>)?(.*?)(?:</code\s*>\s*)?</pre\s*>",
            r"\n\n```\n\1\n```\n\n",
            content,
            flags=flags,
        )
        # h1..h6 -> "#".."######"; heading text is collapsed onto one line.
        content = re.sub(
            r"<h([1-6])\b[^>]*>(.*?)</h\1\s*>",
            lambda m: (
                f"\n\n{'#' * int(m.group(1))} "
                f"{' '.join(m.group(2).split())}\n\n"
            ),
            content,
            flags=flags,
        )
        # p -> paragraph. \b keeps this from matching <pre>/<param>/...
        content = re.sub(
            r"<p\b[^>]*>(.*?)</p\s*>", r"\n\n\1\n\n", content, flags=flags
        )
        # br -> newline
        content = re.sub(r"<br\b[^>]*>", "\n", content, flags=re.IGNORECASE)
        # li -> "- item", one per line. Whitespace after </li> is consumed so
        # source formatting does not turn the list into a loose list.
        content = re.sub(
            r"<li\b[^>]*>(.*?)</li\s*>\s*",
            lambda m: f"- {m.group(1).strip()}\n",
            content,
            flags=flags,
        )
        # ul/ol wrappers -> blank line so lists are separated from neighbours.
        content = re.sub(r"</?[uo]l\b[^>]*>", "\n\n", content, flags=re.IGNORECASE)
        # blockquote -> every line prefixed with ">"
        content = re.sub(
            r"<blockquote\b[^>]*>(.*?)</blockquote\s*>",
            lambda m: self._quote_block(m.group(1)),
            content,
            flags=flags,
        )
        # inline code
        content = re.sub(
            r"<code\b[^>]*>(.*?)</code\s*>", r"`\1`", content, flags=flags
        )
        # Strip any remaining tags.
        content = re.sub(r"<[^>]+>", "", content)
        # Collapse runs of blank lines.
        content = re.sub(r"\n{3,}", "\n\n", content)

        return content.strip()

    def to_plain(self, content: str) -> str:
        """Convert HTML to plain text.

        Line breaks and block-level closing tags become newlines so adjacent
        paragraphs do not run together. All other tags are removed and HTML
        entities are decoded.

        Args:
            content: Source HTML.

        Returns:
            Plain text with surrounding whitespace stripped.
        """
        import html  # stdlib entity decoder; local import keeps the block self-contained

        # Keep line structure before the tags are discarded.
        text = re.sub(r"<br\b[^>]*>", "\n", content, flags=re.IGNORECASE)
        text = re.sub(
            r"</(?:p|div|h[1-6]|li|blockquote|pre|tr)\s*>",
            "\n",
            text,
            flags=re.IGNORECASE,
        )
        # Remove all remaining HTML tags.
        text = re.sub(r"<[^>]+>", "", text)
        # Decode entities only after tag removal, so escaped markup such as
        # "&lt;b&gt;" survives as literal text. "&nbsp;" maps to a regular
        # space, as it did before html.unescape was used.
        text = text.replace("&nbsp;", " ")
        text = html.unescape(text)
        # Collapse repeated spaces.
        text = re.sub(r" {2,}", " ", text)
        # Collapse runs of blank lines.
        text = re.sub(r"\n{3,}", "\n\n", text)

        return text.strip()