import html
import re
from typing import Optional


class HTMLGenerator:
    """Adapt HTML content to a publishing platform's rules.

    All processing is regex based and therefore best-effort: it is meant for
    reasonably well-formed article markup, not as an HTML sanitizer for
    hostile input (attribute values containing ``>`` are not handled).
    """

    # An ``href`` attribute that points at an http(s) URL whose host is NOT
    # exactly ``mp.weixin.qq.com``.  The character class after the host stops
    # look-alike hosts such as ``mp.weixin.qq.com.evil.com`` from being
    # treated as internal.
    _WECHAT_EXTERNAL_HREF = (
        r"href\s*=\s*['\"]https?://(?!mp\.weixin\.qq\.com[/?#'\"])"
    )
    _WECHAT_EXTERNAL_HREF_RE = re.compile(
        r"(?:^|\s)" + _WECHAT_EXTERNAL_HREF, re.IGNORECASE
    )
    _WECHAT_EXTERNAL_OPEN_RE = re.compile(
        r"<a(?=\s)[^>]*?\s" + _WECHAT_EXTERNAL_HREF + r"[^>]*>", re.IGNORECASE
    )
    # A complete <a ...>inner</a> element.  The inner part may not cross
    # another opening <a>, so an unclosed anchor never swallows the closing
    # tag that belongs to a later one.
    _ANCHOR_RE = re.compile(
        r"<a(?=[\s>])([^>]*)>((?:(?!<a[\s>]).)*?)</a\s*>",
        re.IGNORECASE | re.DOTALL,
    )

    def generate(self, content: str, platform: str, format: str = "html") -> str:
        """Apply the platform's HTML rules to ``content`` and render it.

        Args:
            content: HTML content.
            platform: Platform identifier, used as the key into
                ``PLATFORM_RULES``. Unknown platforms get no tag filtering.
            format: Output format: ``"html"`` (default), ``"markdown"`` or
                ``"plain"``. Any other value is treated as ``"html"``.

        Returns:
            The processed content in the requested format.
        """
        # Function-scope import kept from the original implementation.
        from app.services.distribution.platform_rules import PLATFORM_RULES

        rules = PLATFORM_RULES.get(platform) or {}
        html_rules = rules.get("html_rules") or {}
        banned_tags = html_rules.get("banned_tags") or []

        result = content
        for tag in banned_tags:
            # Escape the configured name and require a delimiter after it so
            # that banning "a" does not also remove <article> or <abbr>.
            name = re.escape(tag)
            # Paired form: drop the element together with its content.
            result = re.sub(
                rf"<{name}(?=[\s/>])[^>]*>.*?</{name}\s*>",
                "",
                result,
                flags=re.DOTALL | re.IGNORECASE,
            )
            # Whatever is left: self-closing or unclosed opening tags.
            result = re.sub(
                rf"<{name}(?=[\s/>])[^>]*>", "", result, flags=re.IGNORECASE
            )

        if platform == "wechat":
            # WeChat official accounts: external links are not allowed.
            result = self._strip_external_links(result)

        if format == "markdown":
            return self.to_markdown(result)
        if format == "plain":
            return self.to_plain(result)
        return result

    @classmethod
    def _strip_external_links(cls, content: str) -> str:
        """Unwrap anchors that link outside mp.weixin.qq.com, keeping their text."""

        def unwrap(match: "re.Match[str]") -> str:
            attributes, inner = match.group(1), match.group(2)
            if cls._WECHAT_EXTERNAL_HREF_RE.search(attributes):
                return inner
            # Internal or non-http anchors are left intact, closing tag included.
            return match.group(0)

        content = cls._ANCHOR_RE.sub(unwrap, content)
        # External opening tags that never had a matching </a>.
        return cls._WECHAT_EXTERNAL_OPEN_RE.sub("", content)

    @staticmethod
    def _render_pre(match: "re.Match[str]") -> str:
        """Render a <pre> element as a fenced block without inner markup."""
        # Dropping inner tags keeps <pre><code>..</code></pre> from producing
        # inline-code backticks inside the fence.
        code = re.sub(r"<[^>]+>", "", match.group(1))
        return "```\n" + code + "\n```\n\n"

    @staticmethod
    def _render_heading(match: "re.Match[str]") -> str:
        """Render <hN> as N hash marks followed by the heading text."""
        return "#" * int(match.group(1)) + " " + match.group(2) + "\n\n"

    def to_markdown(self, content: str) -> str:
        """Convert HTML to Markdown.

        Handles h1-h6, p, br, li, blockquote, inline code and pre; every
        other tag is removed and its text kept. HTML entities are not
        decoded.

        Args:
            content: HTML content.

        Returns:
            Markdown text with surrounding whitespace stripped.
        """
        # DOTALL lets elements span several source lines.
        paired = re.IGNORECASE | re.DOTALL

        # <pre> goes first so its content is not touched by the later rules.
        content = re.sub(
            r"<pre(?=[\s>])[^>]*>(.*?)</pre\s*>",
            self._render_pre,
            content,
            flags=paired,
        )
        # The backreference pairs each opening <hN> with the same level.
        content = re.sub(
            r"<h([1-6])(?=[\s>])[^>]*>(.*?)</h\1\s*>",
            self._render_heading,
            content,
            flags=paired,
        )
        # Paragraphs; the lookahead keeps "<p" from matching other tag names.
        content = re.sub(
            r"<p(?=[\s>])[^>]*>(.*?)</p\s*>", r"\1\n\n", content, flags=paired
        )
        # Line breaks, with or without the self-closing slash.
        content = re.sub(
            r"<br(?=[\s/>])[^>]*>", "\n", content, flags=re.IGNORECASE
        )
        # List items (ul and ol alike), one per line.
        content = re.sub(
            r"<li(?=[\s>])[^>]*>(.*?)</li\s*>", r"- \1\n", content, flags=paired
        )
        content = re.sub(
            r"<blockquote(?=[\s>])[^>]*>(.*?)</blockquote\s*>",
            r"> \1\n\n",
            content,
            flags=paired,
        )
        # Inline code.
        content = re.sub(
            r"<code(?=[\s>])[^>]*>(.*?)</code\s*>", r"`\1`", content, flags=paired
        )

        # Remove any remaining tags, then collapse runs of blank lines.
        content = re.sub(r"<[^>]+>", "", content)
        content = re.sub(r"\n{3,}", "\n\n", content)
        return content.strip()

    def to_plain(self, content: str) -> str:
        """Convert HTML to plain text.

        Args:
            content: HTML content.

        Returns:
            Text with tags removed, entities decoded, and runs of spaces and
            blank lines collapsed.
        """
        # Tags are removed before decoding so that escaped markup such as
        # "&lt;b&gt;" survives as literal text.
        text = re.sub(r"<[^>]+>", "", content)

        # Keep the original mapping of &nbsp; to an ordinary space
        # (html.unescape alone would produce U+00A0).
        text = text.replace("&nbsp;", " ")
        # Single-pass decoding: "&amp;quot;" becomes "&quot;", not '"'.
        text = html.unescape(text)

        text = re.sub(r" {2,}", " ", text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()