geo/backend/app/services/distribution/formatter.py

249 lines
8.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""内容格式化器 - 适配不同平台的格式要求"""
import re
class ContentFormatter:
"""内容格式化器将Markdown内容转换为目标平台的最佳格式"""
def format_for_platform(self, content: str, platform: str) -> str:
"""将Markdown内容转换为目标平台的最佳格式"""
formatter = getattr(self, f"_format_{platform}", self._format_default)
return formatter(content)
def _format_wechat(self, content: str) -> str:
"""微信公众号格式:去外链、段落加粗关键句"""
result = content
# 移除外部链接,保留文本
result = re.sub(
r"\[([^\]]+)\]\((?!https?://mp\.weixin\.qq\.com|https?://wx\.qq\.com|weixin://)(?:[^)]+)\)",
r"\1",
result,
)
# 将Markdown标题转为HTML公众号支持HTML
result = re.sub(r"^### (.+)$", r"<h3>\1</h3>", result, flags=re.MULTILINE)
result = re.sub(r"^## (.+)$", r"<h2>\1</h2>", result, flags=re.MULTILINE)
result = re.sub(r"^# (.+)$", r"<h1>\1</h1>", result, flags=re.MULTILINE)
# 加粗关键句:将 **text** 转为 <strong>text</strong>
result = re.sub(r"\*\*([^*]+)\*\*", r"<strong>\1</strong>", result)
# 图片保留(公众号支持)
# result = re.sub(r"!\[([^\]]*)\]\([^)]+\)", r"[图片:\1]", result)
# 代码块处理
result = re.sub(r"```(\w*)\n", r"<pre><code>", result)
result = re.sub(r"```", "</code></pre>", result)
# 行内代码
result = re.sub(r"`([^`]+)`", r"<code>\1</code>", result)
# 段落分隔:双换行 -> 段落
result = re.sub(r"\n\n+", "\n\n", result)
return result.strip()
def _format_xiaohongshu(self, content: str) -> str:
"""小红书格式短句、emoji、话题标签"""
result = content
# 移除所有Markdown标记
result = self._strip_markdown(result)
# 长句拆分为短句
result = self._split_long_sentences(result)
# 添加分段emoji在段落间插入
result = self._add_paragraph_emojis(result)
# 提取并移动话题标签到文末
tags = re.findall(r"#(\S+)", result)
result = re.sub(r"#\S+\s*", "", result)
# 重新添加标签到文末
if tags:
deduped_tags = list(dict.fromkeys(tags))[:8]
tag_line = " ".join(f"#{t}" for t in deduped_tags)
result = result.strip() + "\n\n" + tag_line
return result.strip()
def _format_zhihu(self, content: str) -> str:
"""知乎格式保留Markdown、添加引用标注"""
result = content
# 知乎支持Markdown基本保留
# 为引用内容添加标注
result = re.sub(
r"(^|\n)(据|根据|参考|来源)[:]?\s*",
r"\1> ",
result,
)
# 确保代码块格式正确
# 将不含语言标识的代码块加上默认标识
result = re.sub(r"```\n", "```text\n", result)
# 图片链接转为知乎兼容格式
# result保持原样知乎编辑器会处理
return result.strip()
def _format_baijiahao(self, content: str) -> str:
"""百家号格式转为HTML、结构化小标题"""
result = content
# 标题转HTML
result = re.sub(r"^### (.+)$", r"<h3>\1</h3>", result, flags=re.MULTILINE)
result = re.sub(r"^## (.+)$", r"<h2>\1</h2>", result, flags=re.MULTILINE)
result = re.sub(r"^# (.+)$", r"<h1>\1</h1>", result, flags=re.MULTILINE)
# 加粗
result = re.sub(r"\*\*([^*]+)\*\*", r"<strong>\1</strong>", result)
# 移除代码块(百家号不支持)
result = re.sub(r"```[\s\S]*?```", self._code_block_to_text, result)
# 行内代码移除反引号
result = re.sub(r"`([^`]+)`", r"\1", result)
# 链接保留文本
result = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", result)
return result.strip()
def _format_douyin(self, content: str) -> str:
"""抖音格式:纯文本+emoji极简"""
result = self._strip_markdown(content)
# 限制长度(抖音文案建议简短)
if len(result) > 500:
# 取前500字在句号处截断
truncated = result[:500]
last_period = max(
truncated.rfind(""),
truncated.rfind(""),
truncated.rfind(""),
)
if last_period > 200:
result = truncated[: last_period + 1]
else:
result = truncated
return result.strip()
def _format_toutiao(self, content: str) -> str:
"""今日头条格式HTML+结构化"""
result = content
# 标题转HTML
result = re.sub(r"^### (.+)$", r"<h3>\1</h3>", result, flags=re.MULTILINE)
result = re.sub(r"^## (.+)$", r"<h2>\1</h2>", result, flags=re.MULTILINE)
result = re.sub(r"^# (.+)$", r"<h1>\1</h1>", result, flags=re.MULTILINE)
# 加粗
result = re.sub(r"\*\*([^*]+)\*\*", r"<strong>\1</strong>", result)
# 移除代码块
result = re.sub(r"```[\s\S]*?```", self._code_block_to_text, result)
result = re.sub(r"`([^`]+)`", r"\1", result)
# 链接保留文本
result = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", result)
return result.strip()
def _format_default(self, content: str) -> str:
"""默认格式清理Markdown为纯文本"""
return self._strip_markdown(content).strip()
# --- 辅助方法 ---
@staticmethod
def _strip_markdown(text: str) -> str:
"""清理所有Markdown标记为纯文本"""
result = text
# 移除代码块
result = re.sub(r"```[\s\S]*?```", ContentFormatter._code_block_to_text, result)
# 移除行内代码
result = re.sub(r"`([^`]+)`", r"\1", result)
# 移除图片保留alt
result = re.sub(r"!\[([^\]]*)\]\([^)]+\)", r"[\1]", result)
# 移除链接,保留文本
result = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", result)
# 移除加粗/斜体
result = re.sub(r"\*\*\*([^*]+)\*\*\*", r"\1", result)
result = re.sub(r"\*\*([^*]+)\*\*", r"\1", result)
result = re.sub(r"\*([^*]+)\*", r"\1", result)
# 移除标题标记
result = re.sub(r"^#{1,6}\s+", "", result, flags=re.MULTILINE)
# 移除引用标记
result = re.sub(r"^>\s+", "", result, flags=re.MULTILINE)
# 移除水平线
result = re.sub(r"^---+$", "", result, flags=re.MULTILINE)
# 移除列表标记
result = re.sub(r"^[\s]*[-*+]\s+", "", result, flags=re.MULTILINE)
result = re.compile(r"^[\s]*\d+\.\s+", flags=re.MULTILINE).sub("", result)
return result
@staticmethod
def _code_block_to_text(match: re.Match | str) -> str:
"""将代码块匹配转为纯文本"""
if isinstance(match, re.Match):
text = match.group(0)
else:
text = match
# 移除代码围栏,保留内容
text = re.sub(r"^```\w*\n?", "", text)
text = re.sub(r"\n?```$", "", text)
return text.strip()
@staticmethod
def _split_long_sentences(text: str) -> str:
"""将长句拆分为短句"""
result = text
# 在句号后如果跟长文本,插入换行
result = re.sub(r"([。!?])", r"\1\n", result)
# 清理多余换行
result = re.sub(r"\n{3,}", "\n\n", result)
return result
@staticmethod
def _add_paragraph_emojis(text: str) -> str:
"""在段落间添加emoji小红书风格"""
emojis = ["", "💡", "📌", "🔥", "", "🎯", "💪", "🌟"]
lines = text.split("\n")
result: list[str] = []
emoji_idx = 0
for i, line in enumerate(lines):
stripped = line.strip()
if not stripped:
if result and result[-1].strip():
result.append("")
continue
# 每隔几段添加emoji
if i > 0 and stripped and not stripped.startswith("#") and emoji_idx < len(emojis):
if result and result[-1].strip() == "":
result.append(f"{emojis[emoji_idx]} {stripped}")
emoji_idx += 1
else:
result.append(stripped)
else:
result.append(stripped)
return "\n".join(result)