249 lines
8.5 KiB
Python
249 lines
8.5 KiB
Python
"""内容格式化器 - 适配不同平台的格式要求"""
|
||
|
||
import re
|
||
|
||
|
||
class ContentFormatter:
|
||
"""内容格式化器:将Markdown内容转换为目标平台的最佳格式"""
|
||
|
||
def format_for_platform(self, content: str, platform: str) -> str:
|
||
"""将Markdown内容转换为目标平台的最佳格式"""
|
||
formatter = getattr(self, f"_format_{platform}", self._format_default)
|
||
return formatter(content)
|
||
|
||
def _format_wechat(self, content: str) -> str:
|
||
"""微信公众号格式:去外链、段落加粗关键句"""
|
||
result = content
|
||
|
||
# 移除外部链接,保留文本
|
||
result = re.sub(
|
||
r"\[([^\]]+)\]\((?!https?://mp\.weixin\.qq\.com|https?://wx\.qq\.com|weixin://)(?:[^)]+)\)",
|
||
r"\1",
|
||
result,
|
||
)
|
||
|
||
# 将Markdown标题转为HTML(公众号支持HTML)
|
||
result = re.sub(r"^### (.+)$", r"<h3>\1</h3>", result, flags=re.MULTILINE)
|
||
result = re.sub(r"^## (.+)$", r"<h2>\1</h2>", result, flags=re.MULTILINE)
|
||
result = re.sub(r"^# (.+)$", r"<h1>\1</h1>", result, flags=re.MULTILINE)
|
||
|
||
# 加粗关键句:将 **text** 转为 <strong>text</strong>
|
||
result = re.sub(r"\*\*([^*]+)\*\*", r"<strong>\1</strong>", result)
|
||
|
||
# 图片保留(公众号支持)
|
||
# result = re.sub(r"!\[([^\]]*)\]\([^)]+\)", r"[图片:\1]", result)
|
||
|
||
# 代码块处理
|
||
result = re.sub(r"```(\w*)\n", r"<pre><code>", result)
|
||
result = re.sub(r"```", "</code></pre>", result)
|
||
|
||
# 行内代码
|
||
result = re.sub(r"`([^`]+)`", r"<code>\1</code>", result)
|
||
|
||
# 段落分隔:双换行 -> 段落
|
||
result = re.sub(r"\n\n+", "\n\n", result)
|
||
|
||
return result.strip()
|
||
|
||
def _format_xiaohongshu(self, content: str) -> str:
|
||
"""小红书格式:短句、emoji、话题标签"""
|
||
result = content
|
||
|
||
# 移除所有Markdown标记
|
||
result = self._strip_markdown(result)
|
||
|
||
# 长句拆分为短句
|
||
result = self._split_long_sentences(result)
|
||
|
||
# 添加分段emoji(在段落间插入)
|
||
result = self._add_paragraph_emojis(result)
|
||
|
||
# 提取并移动话题标签到文末
|
||
tags = re.findall(r"#(\S+)", result)
|
||
result = re.sub(r"#\S+\s*", "", result)
|
||
|
||
# 重新添加标签到文末
|
||
if tags:
|
||
deduped_tags = list(dict.fromkeys(tags))[:8]
|
||
tag_line = " ".join(f"#{t}" for t in deduped_tags)
|
||
result = result.strip() + "\n\n" + tag_line
|
||
|
||
return result.strip()
|
||
|
||
def _format_zhihu(self, content: str) -> str:
|
||
"""知乎格式:保留Markdown、添加引用标注"""
|
||
result = content
|
||
|
||
# 知乎支持Markdown,基本保留
|
||
# 为引用内容添加标注
|
||
result = re.sub(
|
||
r"(^|\n)(据|根据|参考|来源)[::]?\s*",
|
||
r"\1> ",
|
||
result,
|
||
)
|
||
|
||
# 确保代码块格式正确
|
||
# 将不含语言标识的代码块加上默认标识
|
||
result = re.sub(r"```\n", "```text\n", result)
|
||
|
||
# 图片链接转为知乎兼容格式
|
||
# result保持原样,知乎编辑器会处理
|
||
|
||
return result.strip()
|
||
|
||
def _format_baijiahao(self, content: str) -> str:
|
||
"""百家号格式:转为HTML、结构化小标题"""
|
||
result = content
|
||
|
||
# 标题转HTML
|
||
result = re.sub(r"^### (.+)$", r"<h3>\1</h3>", result, flags=re.MULTILINE)
|
||
result = re.sub(r"^## (.+)$", r"<h2>\1</h2>", result, flags=re.MULTILINE)
|
||
result = re.sub(r"^# (.+)$", r"<h1>\1</h1>", result, flags=re.MULTILINE)
|
||
|
||
# 加粗
|
||
result = re.sub(r"\*\*([^*]+)\*\*", r"<strong>\1</strong>", result)
|
||
|
||
# 移除代码块(百家号不支持)
|
||
result = re.sub(r"```[\s\S]*?```", self._code_block_to_text, result)
|
||
|
||
# 行内代码移除反引号
|
||
result = re.sub(r"`([^`]+)`", r"\1", result)
|
||
|
||
# 链接保留文本
|
||
result = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", result)
|
||
|
||
return result.strip()
|
||
|
||
def _format_douyin(self, content: str) -> str:
|
||
"""抖音格式:纯文本+emoji,极简"""
|
||
result = self._strip_markdown(content)
|
||
|
||
# 限制长度(抖音文案建议简短)
|
||
if len(result) > 500:
|
||
# 取前500字,在句号处截断
|
||
truncated = result[:500]
|
||
last_period = max(
|
||
truncated.rfind("。"),
|
||
truncated.rfind("!"),
|
||
truncated.rfind("?"),
|
||
)
|
||
if last_period > 200:
|
||
result = truncated[: last_period + 1]
|
||
else:
|
||
result = truncated
|
||
|
||
return result.strip()
|
||
|
||
def _format_toutiao(self, content: str) -> str:
|
||
"""今日头条格式:HTML+结构化"""
|
||
result = content
|
||
|
||
# 标题转HTML
|
||
result = re.sub(r"^### (.+)$", r"<h3>\1</h3>", result, flags=re.MULTILINE)
|
||
result = re.sub(r"^## (.+)$", r"<h2>\1</h2>", result, flags=re.MULTILINE)
|
||
result = re.sub(r"^# (.+)$", r"<h1>\1</h1>", result, flags=re.MULTILINE)
|
||
|
||
# 加粗
|
||
result = re.sub(r"\*\*([^*]+)\*\*", r"<strong>\1</strong>", result)
|
||
|
||
# 移除代码块
|
||
result = re.sub(r"```[\s\S]*?```", self._code_block_to_text, result)
|
||
result = re.sub(r"`([^`]+)`", r"\1", result)
|
||
|
||
# 链接保留文本
|
||
result = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", result)
|
||
|
||
return result.strip()
|
||
|
||
def _format_default(self, content: str) -> str:
|
||
"""默认格式:清理Markdown为纯文本"""
|
||
return self._strip_markdown(content).strip()
|
||
|
||
# --- 辅助方法 ---
|
||
|
||
@staticmethod
|
||
def _strip_markdown(text: str) -> str:
|
||
"""清理所有Markdown标记为纯文本"""
|
||
result = text
|
||
|
||
# 移除代码块
|
||
result = re.sub(r"```[\s\S]*?```", ContentFormatter._code_block_to_text, result)
|
||
|
||
# 移除行内代码
|
||
result = re.sub(r"`([^`]+)`", r"\1", result)
|
||
|
||
# 移除图片,保留alt
|
||
result = re.sub(r"!\[([^\]]*)\]\([^)]+\)", r"[\1]", result)
|
||
|
||
# 移除链接,保留文本
|
||
result = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", result)
|
||
|
||
# 移除加粗/斜体
|
||
result = re.sub(r"\*\*\*([^*]+)\*\*\*", r"\1", result)
|
||
result = re.sub(r"\*\*([^*]+)\*\*", r"\1", result)
|
||
result = re.sub(r"\*([^*]+)\*", r"\1", result)
|
||
|
||
# 移除标题标记
|
||
result = re.sub(r"^#{1,6}\s+", "", result, flags=re.MULTILINE)
|
||
|
||
# 移除引用标记
|
||
result = re.sub(r"^>\s+", "", result, flags=re.MULTILINE)
|
||
|
||
# 移除水平线
|
||
result = re.sub(r"^---+$", "", result, flags=re.MULTILINE)
|
||
|
||
# 移除列表标记
|
||
result = re.sub(r"^[\s]*[-*+]\s+", "", result, flags=re.MULTILINE)
|
||
result = re.compile(r"^[\s]*\d+\.\s+", flags=re.MULTILINE).sub("", result)
|
||
|
||
return result
|
||
|
||
@staticmethod
|
||
def _code_block_to_text(match: re.Match | str) -> str:
|
||
"""将代码块匹配转为纯文本"""
|
||
if isinstance(match, re.Match):
|
||
text = match.group(0)
|
||
else:
|
||
text = match
|
||
# 移除代码围栏,保留内容
|
||
text = re.sub(r"^```\w*\n?", "", text)
|
||
text = re.sub(r"\n?```$", "", text)
|
||
return text.strip()
|
||
|
||
@staticmethod
|
||
def _split_long_sentences(text: str) -> str:
|
||
"""将长句拆分为短句"""
|
||
result = text
|
||
# 在句号后如果跟长文本,插入换行
|
||
result = re.sub(r"([。!?])", r"\1\n", result)
|
||
# 清理多余换行
|
||
result = re.sub(r"\n{3,}", "\n\n", result)
|
||
return result
|
||
|
||
@staticmethod
|
||
def _add_paragraph_emojis(text: str) -> str:
|
||
"""在段落间添加emoji(小红书风格)"""
|
||
emojis = ["✨", "💡", "📌", "🔥", "⭐", "🎯", "💪", "🌟"]
|
||
lines = text.split("\n")
|
||
result: list[str] = []
|
||
emoji_idx = 0
|
||
|
||
for i, line in enumerate(lines):
|
||
stripped = line.strip()
|
||
if not stripped:
|
||
if result and result[-1].strip():
|
||
result.append("")
|
||
continue
|
||
|
||
# 每隔几段添加emoji
|
||
if i > 0 and stripped and not stripped.startswith("#") and emoji_idx < len(emojis):
|
||
if result and result[-1].strip() == "":
|
||
result.append(f"{emojis[emoji_idx]} {stripped}")
|
||
emoji_idx += 1
|
||
else:
|
||
result.append(stripped)
|
||
else:
|
||
result.append(stripped)
|
||
|
||
return "\n".join(result)
|