geo/backend/app/services/content/html_generator.py

119 lines
3.9 KiB
Python

import re
from typing import Optional
class HTMLGenerator:
    """Adapt HTML content to a target platform and output format.

    All transformations are regex based and best-effort; this is not a full
    HTML parser, so malformed or deeply nested markup may not be handled.
    """

    # Opening <a> tag whose href is an absolute http(s) URL that does NOT
    # point at the host mp.weixin.qq.com. The lookahead requires a delimiter
    # right after the host so "mp.weixin.qq.com.evil.com" is not mistaken
    # for the allowed host.
    _EXTERNAL_LINK_OPEN = (
        r"<a\s+(?:[^>]*?\s)?href\s*=\s*['\"]https?://"
        r"(?!mp\.weixin\.qq\.com[/:?#'\"])[^'\"]*['\"][^>]*>"
    )
    # External link together with its inner content (group 1) and its own
    # closing tag. The tempered dot refuses to run across another <a ...>
    # opener, so an unclosed external link cannot swallow the closing tag of
    # a following link.
    _EXTERNAL_LINK_PAIR = re.compile(
        _EXTERNAL_LINK_OPEN + r"((?:(?!<a\b).)*?)</a\s*>",
        re.IGNORECASE | re.DOTALL,
    )
    _EXTERNAL_LINK_DANGLING = re.compile(_EXTERNAL_LINK_OPEN, re.IGNORECASE)

    # Closing tags of block-level elements; they become line breaks in
    # plain-text output so adjacent blocks do not run together.
    _BLOCK_CLOSERS = re.compile(
        r"</(?:p|div|h[1-6]|li|tr|blockquote|pre)\s*>", re.IGNORECASE
    )

    def generate(
        self,
        content: str,
        platform: str,
        format: str = "html",
        *,
        platform_rules: Optional[dict] = None,
    ) -> str:
        """Generate output for ``platform`` according to its HTML rules.

        Args:
            content: HTML content to process.
            platform: Platform identifier used to look up the rules.
            format: Output format: ``"html"``, ``"markdown"`` or ``"plain"``.
                Any other value returns the processed HTML.
            platform_rules: Optional mapping of platform identifier to its
                rule dict (read keys: ``html_rules`` -> ``banned_tags``).
                When omitted, the project-wide ``PLATFORM_RULES`` is used.

        Returns:
            The processed content in the requested format.
        """
        if platform_rules is None:
            # Imported lazily, as in the original implementation.
            from app.services.distribution.platform_rules import PLATFORM_RULES

            platform_rules = PLATFORM_RULES

        # "or" fallbacks tolerate keys that are present but set to None.
        rules = platform_rules.get(platform) or {}
        html_rules = rules.get("html_rules") or {}
        banned_tags = html_rules.get("banned_tags") or []

        result = self._strip_banned_tags(content, banned_tags)

        if platform == "wechat":
            # WeChat official accounts: drop external links, keep their text.
            result = self._unwrap_external_links(result)

        if format == "markdown":
            return self.to_markdown(result)
        if format == "plain":
            return self.to_plain(result)
        return result

    def _strip_banned_tags(self, content: str, banned_tags) -> str:
        """Remove every banned element (with its content) from ``content``."""
        for tag in banned_tags:
            # Escape the name and require a tag-name boundary so that banning
            # "s" does not also remove <span> or <script>.
            name = re.escape(tag) + r"(?![\w-])"
            # Paired elements, including their content.
            content = re.sub(
                rf"<{name}[^>]*>.*?</{name}\s*>",
                "",
                content,
                flags=re.DOTALL | re.IGNORECASE,
            )
            # Leftover openers, self-closing tags and stray closing tags.
            content = re.sub(rf"</?{name}[^>]*>", "", content, flags=re.IGNORECASE)
        return content

    def _unwrap_external_links(self, content: str) -> str:
        """Replace external links by their inner content; keep other links."""
        content = self._EXTERNAL_LINK_PAIR.sub(lambda m: m.group(1), content)
        # External openers that never had a closing tag are simply dropped.
        return self._EXTERNAL_LINK_DANGLING.sub("", content)

    def to_markdown(self, content: str) -> str:
        """Convert HTML to Markdown.

        Handles headings (h1-h6), paragraphs, line breaks, list items,
        blockquotes, inline code and preformatted blocks; any other tag is
        stripped and only its text is kept.

        Args:
            content: HTML content.

        Returns:
            Markdown text with surrounding whitespace removed.
        """
        flags = re.IGNORECASE | re.DOTALL

        def fence(match):
            # <pre><code>...</code></pre> must not get inline backticks
            # inside the fence, so the optional <code> wrapper is removed.
            code = re.sub(
                r"^\s*<code\b[^>]*>|</code\s*>\s*$",
                "",
                match.group(1),
                flags=re.IGNORECASE,
            )
            return "\n\n```\n" + code.strip("\n") + "\n```\n\n"

        def heading(match):
            # A Markdown heading must stay on one line.
            title = " ".join(match.group(2).split())
            return "\n\n" + "#" * int(match.group(1)) + " " + title + "\n\n"

        def list_item(match):
            return "- " + match.group(1).strip() + "\n"

        def quote(match):
            # Every line of the quote needs its own "> " prefix.
            lines = match.group(1).strip().splitlines()
            quoted = "\n".join(("> " + line).rstrip() for line in lines)
            return "\n\n" + quoted + "\n\n"

        # pre goes first: otherwise a "<p..." pattern could match "<pre>".
        # The \b after each tag name guards against such prefix matches.
        content = re.sub(r"<pre\b[^>]*>(.*?)</pre\s*>", fence, content, flags=flags)
        content = re.sub(
            r"<h([1-6])\b[^>]*>(.*?)</h\1\s*>", heading, content, flags=flags
        )
        content = re.sub(r"<p\b[^>]*>(.*?)</p\s*>", r"\1\n\n", content, flags=flags)
        content = re.sub(r"<br\b[^>]*>", "\n", content, flags=re.IGNORECASE)
        content = re.sub(r"<li\b[^>]*>(.*?)</li\s*>", list_item, content, flags=flags)
        content = re.sub(
            r"<blockquote\b[^>]*>(.*?)</blockquote\s*>", quote, content, flags=flags
        )
        content = re.sub(
            r"<code\b[^>]*>(.*?)</code\s*>", r"`\1`", content, flags=flags
        )

        # Strip whatever tags are left, then collapse runs of blank lines.
        content = re.sub(r"<[^>]+>", "", content)
        content = re.sub(r"\n{3,}", "\n\n", content)
        return content.strip()

    def to_plain(self, content: str) -> str:
        """Convert HTML to plain text.

        Args:
            content: HTML content.

        Returns:
            Text with tags removed, HTML entities decoded, and runs of
            spaces / blank lines collapsed.
        """
        import html

        # Keep line structure: <br> and block-level closers become newlines.
        text = re.sub(r"<br\b[^>]*>", "\n", content, flags=re.IGNORECASE)
        text = self._BLOCK_CLOSERS.sub("\n", text)
        # Remove all remaining tags.
        text = re.sub(r"<[^>]+>", "", text)
        # Decode entities in a single pass (chained replaces double-decoded
        # input such as "&amp;quot;"); non-breaking spaces become plain
        # spaces as before.
        text = html.unescape(text).replace("\xa0", " ")
        # Collapse repeated spaces and excess blank lines.
        text = re.sub(r" {2,}", " ", text)
        text = re.sub(r"\n{3,}", "\n\n", text)
        return text.strip()