119 lines
3.9 KiB
Python
119 lines
3.9 KiB
Python
import html
import re
from typing import Optional
|
|
|
|
|
|
class HTMLGenerator:
    """HTML generator: adapts HTML content to a target platform's rules."""

    # Any opening or closing anchor tag. The lookahead keeps <abbr>, <area>,
    # <article>, ... from being mistaken for <a>.
    _ANCHOR_TAG_RE = re.compile(r"</a\s*>|<a(?=[\s>])[^>]*>", re.IGNORECASE)

    # A quoted absolute http(s) href whose host is not exactly
    # mp.weixin.qq.com. The host must be followed by a URL delimiter or the
    # closing quote, so "mp.weixin.qq.com.evil.example" counts as external.
    _EXTERNAL_HREF_RE = re.compile(
        r"""\shref\s*=\s*['"]https?://(?!mp\.weixin\.qq\.com[/?#:'"])""",
        re.IGNORECASE,
    )

    def generate(self, content: str, platform: str, format: str = "html") -> str:
        """Produce platform-adapted output from HTML content.

        Args:
            content: HTML content.
            platform: Platform identifier, used as the key into PLATFORM_RULES.
            format: Output format ("html", "markdown" or "plain"). Any other
                value returns the processed HTML.

        Returns:
            The processed content in the requested format.
        """
        # Imported at call time, as in the original implementation.
        from app.services.distribution.platform_rules import PLATFORM_RULES

        rules = PLATFORM_RULES.get(platform, {})
        html_rules = rules.get("html_rules", {})
        banned_tags = html_rules.get("banned_tags", [])

        # Remove banned elements together with their content.
        result = self._strip_banned_tags(content, banned_tags)

        # Platform-specific handling.
        if platform == "wechat":
            # WeChat official accounts: drop external links, keep their text.
            result = self._unwrap_external_links(result)

        if format == "markdown":
            return self.to_markdown(result)
        if format == "plain":
            return self.to_plain(result)
        return result

    @staticmethod
    def _strip_banned_tags(content: str, banned_tags: list) -> str:
        """Remove every banned element (with its content) from the HTML.

        Tag names are regex-escaped and must be followed by whitespace, "/"
        or ">", so banning "s" does not also remove <span> or <script>.
        """
        result = content
        for tag in banned_tags:
            if not tag:
                # An empty name would otherwise match every tag.
                continue
            name = re.escape(tag)
            # Paired form: <tag ...> ... </tag>
            result = re.sub(
                rf"<{name}(?=[\s/>])[^>]*>.*?</{name}\s*>",
                "",
                result,
                flags=re.DOTALL | re.IGNORECASE,
            )
            # Self-closing, void or orphaned tags left behind.
            result = re.sub(
                rf"</?{name}(?=[\s/>])[^>]*>", "", result, flags=re.IGNORECASE
            )
        return result

    @classmethod
    def _unwrap_external_links(cls, content: str) -> str:
        """Drop external <a> tags (and their matching </a>), keeping inner text.

        Anchors that are not external (mp.weixin.qq.com, relative links, ...)
        keep both their opening and closing tags, so the markup stays balanced.
        """
        # Anchors cannot nest, so remembering whether the most recent opening
        # tag was dropped is enough to pair each </a> with its opener.
        last_open_dropped = False

        def rewrite(match):
            nonlocal last_open_dropped
            tag = match.group(0)
            if tag.startswith("</"):
                drop = last_open_dropped
                last_open_dropped = False
                return "" if drop else tag
            last_open_dropped = cls._EXTERNAL_HREF_RE.search(tag) is not None
            return "" if last_open_dropped else tag

        return cls._ANCHOR_TAG_RE.sub(rewrite, content)

    def to_markdown(self, content: str) -> str:
        """Convert HTML to Markdown.

        Handles <pre>, <h1>-<h6>, <p>, <br>, <li>, <blockquote> and <code>;
        any other tag is stripped and its text kept.

        Args:
            content: HTML content.

        Returns:
            Markdown text with surrounding whitespace removed.
        """
        flags = re.IGNORECASE | re.DOTALL

        def fence(match):
            # <pre><code>...</code></pre>: drop the inner <code> wrapper so no
            # stray backticks end up inside the fenced block.
            inner = re.sub(
                r"</?code(?=[\s>])[^>]*>", "", match.group(1), flags=re.IGNORECASE
            )
            return "```\n" + inner + "\n```\n\n"

        def heading(match):
            return "#" * int(match.group(1)) + " " + match.group(2) + "\n\n"

        def quote(match):
            # Prefix every line so multi-line quotes stay inside the quote.
            lines = match.group(1).strip().split("\n")
            return "\n".join("> " + line if line else ">" for line in lines) + "\n\n"

        # <pre> goes first so the inline-<code> rule cannot touch its content.
        content = re.sub(r"<pre(?=[\s>])[^>]*>(.*?)</pre\s*>", fence, content, flags=flags)
        # h1..h6 -> "#".."######"; block elements end with a blank line so the
        # following text does not run into them.
        content = re.sub(
            r"<h([1-6])(?=[\s>])[^>]*>(.*?)</h\1\s*>", heading, content, flags=flags
        )
        # p -> paragraph
        content = re.sub(r"<p(?=[\s>])[^>]*>(.*?)</p\s*>", r"\1\n\n", content, flags=flags)
        # br -> line break
        content = re.sub(r"<br(?=[\s/>])[^>]*>", "\n", content, flags=re.IGNORECASE)
        # li -> one bullet per line
        content = re.sub(r"<li(?=[\s>])[^>]*>(.*?)</li\s*>", r"- \1\n", content, flags=flags)
        # blockquote
        content = re.sub(
            r"<blockquote(?=[\s>])[^>]*>(.*?)</blockquote\s*>", quote, content, flags=flags
        )
        # inline code
        content = re.sub(r"<code(?=[\s>])[^>]*>(.*?)</code\s*>", r"`\1`", content, flags=flags)
        # Strip any remaining tags.
        content = re.sub(r"<[^>]+>", "", content)
        # Collapse runs of blank lines.
        content = re.sub(r"\n{3,}", "\n\n", content)

        return content.strip()

    def to_plain(self, content: str) -> str:
        """Convert HTML to plain text.

        Args:
            content: HTML content.

        Returns:
            Text with tags removed, HTML entities decoded and redundant
            spaces and blank lines collapsed.
        """
        # Remove all HTML tags first, so escaped markup such as "&lt;b&gt;"
        # survives as literal text instead of being stripped as a tag.
        text = re.sub(r"<[^>]+>", "", content)
        # Decode HTML entities; non-breaking spaces become ordinary spaces.
        text = html.unescape(text).replace("\xa0", " ")
        # Collapse redundant spaces.
        text = re.sub(r" {2,}", " ", text)
        # Collapse redundant blank lines.
        text = re.sub(r"\n{3,}", "\n\n", text)

        return text.strip()
|