# Listing metadata from the source viewer: 319 lines, 10 KiB, Python.
"""内容规则校验服务"""
|
||
import re
|
||
from dataclasses import dataclass
|
||
from typing import Optional
|
||
|
||
from app.services.distribution.platform_rules import PLATFORM_RULES
|
||
|
||
|
||
@dataclass
class ValidationIssue:
    """One problem reported by a validation check."""

    # Severity label; the checks in this file emit "high", "medium" or "low".
    severity: str
    # Human-readable description of what is wrong.
    message: str
    # Tag naming the kind of check that raised the issue
    # (for example "title_length" or "platform_rule").
    category: str
|
||
|
||
|
||
@dataclass
class ValidationResult:
    """Outcome of validating one piece of content against a platform's rules."""

    # True when none of the reported issues has severity "high".
    is_valid: bool
    # Quality score; the validator in this file starts at 100, subtracts a
    # penalty per issue and never goes below 0.
    score: int
    # Every problem that was found, in the order the checks ran.
    issues: list["ValidationIssue"]
    # Human-readable descriptions of the checks that passed.
    passed: list[str]
|
||
|
||
|
||
@dataclass
class AI_Pattern:
    """A marker of AI-style writing that was found in the content."""

    # The banned word or banned-structure expression that matched.
    pattern: str
    # Kind of match: "banned_word" or "banned_structure".
    type: str
    # Severity label attached to the match ("medium" or "high").
    severity: str
|
||
|
||
|
||
class RuleValidator:
    """Validate a title and body text against per-platform publishing rules.

    Rule configuration is read from ``PLATFORM_RULES``, keyed by platform
    identifier. On top of the configured length and AI-pattern rules, a few
    platforms have hard-coded checks implemented by the private ``_check_*``
    helpers below.
    """

    # Points deducted from the score per issue, by severity. Any severity not
    # listed here costs _DEFAULT_PENALTY.
    _SEVERITY_PENALTY = {"high": 15, "medium": 8}
    _DEFAULT_PENALTY = 3

    # Patterns are compiled once at class creation instead of on every call.

    # A share/follow/like/favourite verb followed within four characters by a
    # reward word.
    _INDUCING_PATTERN = re.compile(
        r"(转发|分享|关注|点赞|收藏).{0,4}(领|获|得|拿|解锁|免费)",
        re.IGNORECASE,
    )

    # Three or more consecutive exclamation/question marks. The full-width
    # forms are included because Chinese titles normally use them.
    _CONSECUTIVE_SYMBOLS = re.compile(r"[!!??]{3,}")

    # http(s) links whose host does not start with an allowed WeChat domain.
    # NOTE(review): the allow-list is a prefix match with no host boundary, so
    # a host such as "mp.weixin.qq.com.example.org" is not reported - confirm
    # whether that is acceptable.
    _EXTERNAL_LINK = re.compile(
        r"https?://(?!mp\.weixin\.qq\.com|wx\.qq\.com|weixin://)[^\s<>))]+",
        re.IGNORECASE,
    )

    # Another platform's name followed later on the same line by a
    # watermark/logo mention.
    _WATERMARK_PATTERN = re.compile(
        r"(抖音|快手|小红书|微博|B站|bilibili).*(水印|logo)",
        re.IGNORECASE,
    )

    # Tuples (not sets) so that matches are always reported in this order.
    _CLICKBAIT_WORDS = (
        "震惊", "惊呆", "吓死", "笑死", "疯传", "刷屏", "出大事", "不敢相信",
    )
    _MARKETING_WORDS = ("购买", "下单", "优惠价", "限时折扣", "点击购买")
    _CROSS_PLATFORM_KEYWORDS = ("微信", "公众号", "抖音号", "微博")

    def validate(self, content: str, title: str, platform: str) -> "ValidationResult":
        """Check whether content and title satisfy the rules of a platform.

        Args:
            content: Body text. ``None`` is treated as empty text.
            title: Title text. ``None`` is treated as empty text.
            platform: Platform identifier used to look up ``PLATFORM_RULES``.

        Returns:
            ValidationResult: ``is_valid`` is True when no issue has severity
            "high"; ``score`` is 100 minus 15 per high issue, 8 per medium
            issue and 3 per any other issue, never below 0.

        Raises:
            ValueError: If ``PLATFORM_RULES`` has no (non-empty) entry for
                ``platform``.
        """
        rules = PLATFORM_RULES.get(platform)
        if not rules:
            raise ValueError(f"不支持的平台: {platform}")

        content = content or ""
        title = title or ""
        issues: list[ValidationIssue] = []
        passed: list[str] = []

        # Title length; limits default to 5..30 characters when not configured.
        title_rules = rules.get("title_rules", {})
        max_title = title_rules.get("max_length", 30)
        min_title = title_rules.get("min_length", 5)
        title_len = len(title)
        if title_len > max_title:
            issues.append(ValidationIssue(
                "high",
                f"标题长度 {title_len} 超过限制 {max_title}",
                "title_length",
            ))
        elif title_len < min_title:
            issues.append(ValidationIssue(
                "medium",
                f"标题长度 {title_len} 低于最低要求 {min_title}",
                "title_length",
            ))
        else:
            passed.append(f"标题长度合规({title_len}/{max_title})")

        # Body length; the minimum is only enforced when configured above 0.
        content_rules = rules.get("content_length", {})
        max_content = content_rules.get("max", 20000)
        min_content = content_rules.get("min", 0)
        content_len = len(content)
        if content_len > max_content:
            issues.append(ValidationIssue(
                "high",
                f"内容长度 {content_len} 超过限制 {max_content}",
                "content_length",
            ))
        elif min_content > 0 and content_len < min_content:
            issues.append(ValidationIssue(
                "medium",
                f"内容长度 {content_len} 低于建议最低 {min_content}",
                "content_length",
            ))
        else:
            passed.append(f"内容长度合规({content_len}/{max_content})")

        # AI-pattern detection runs only for platforms that require it. The
        # severity recorded on each detected pattern is carried through.
        if rules.get("ai_sensitivity", {}).get("humanization_required", False):
            issues.extend(
                ValidationIssue(
                    found.severity,
                    f"发现AI写作特征: {found.pattern}",
                    "ai_pattern",
                )
                for found in self.detect_ai_patterns(content, platform)
            )

        platform_issues, platform_passed = self._validate_platform_specific(
            content, title, platform
        )
        issues.extend(platform_issues)
        passed.extend(platform_passed)

        penalty = sum(
            self._SEVERITY_PENALTY.get(issue.severity, self._DEFAULT_PENALTY)
            for issue in issues
        )
        score = max(0, 100 - penalty)
        is_valid = all(issue.severity != "high" for issue in issues)

        return ValidationResult(is_valid, score, issues, passed)

    def detect_ai_patterns(self, content: str, platform: str) -> "list[AI_Pattern]":
        """Find configured AI-writing markers in the content.

        Entries of ``banned_patterns`` are matched as literal substrings and
        every match is reported. Entries of ``banned_structures`` are applied
        as regular expressions, and scanning stops at the first one that
        matches, so at most one structure is reported.

        Args:
            content: Body text. ``None`` is treated as empty text.
            platform: Platform identifier used to look up ``PLATFORM_RULES``.

        Returns:
            list[AI_Pattern]: Detected markers, banned words first. Empty when
            the platform has no rules.
        """
        rules = PLATFORM_RULES.get(platform)
        if not rules:
            return []

        content = content or ""
        ai_config = rules.get("ai_sensitivity", {})

        results: list[AI_Pattern] = [
            AI_Pattern(word, "banned_word", "medium")
            for word in ai_config.get("banned_patterns", [])
            if word in content
        ]

        for structure in ai_config.get("banned_structures", []):
            if re.search(structure, content):
                results.append(AI_Pattern(structure, "banned_structure", "medium"))
                break

        return results

    def get_optimization_tips(self, platform: str) -> list[str]:
        """Return the ``seo_tips`` configured for a platform.

        Args:
            platform: Platform identifier used to look up ``PLATFORM_RULES``.

        Returns:
            list[str]: The configured tips, or an empty list when the platform
            has no rules or no ``seo_tips`` entry.
        """
        rules = PLATFORM_RULES.get(platform)
        if not rules:
            return []
        return rules.get("seo_tips", [])

    def _validate_platform_specific(
        self, content: str, title: str, platform: str
    ) -> "tuple[list[ValidationIssue], list[str]]":
        """Run the hard-coded checks that apply to a single platform.

        Args:
            content: Body text. ``None`` is treated as empty text.
            title: Title text. ``None`` is treated as empty text.
            platform: Platform identifier.

        Returns:
            tuple: ``(issues, passed)``. Both lists are empty for a platform
            without hard-coded checks.
        """
        content = content or ""
        title = title or ""

        if platform == "wechat":
            return self._check_wechat(content, title)
        if platform == "zhihu":
            return self._check_marketing(content)
        if platform == "xiaohongshu":
            return self._check_xiaohongshu(content)
        if platform in ("baijiahao", "toutiao"):
            return self._check_clickbait(title)
        if platform == "douyin":
            return self._check_watermark(content)
        return [], []

    @staticmethod
    def _find_keywords(text: str, keywords) -> list[str]:
        """Return the keywords contained in text, in the order of ``keywords``."""
        return [word for word in keywords if word in text]

    def _check_marketing(
        self, content: str
    ) -> "tuple[list[ValidationIssue], list[str]]":
        """Report marketing wording found in the body (shared by wechat and zhihu)."""
        found = self._find_keywords(content, self._MARKETING_WORDS)
        if found:
            issue = ValidationIssue(
                "medium",
                f"疑似营销用语: {', '.join(found)}",
                "platform_rule",
            )
            return [issue], []
        return [], ["未检测到过度营销用语"]

    def _check_wechat(
        self, content: str, title: str
    ) -> "tuple[list[ValidationIssue], list[str]]":
        """WeChat checks: share/follow bait, symbol runs, external links, marketing."""
        issues: list[ValidationIssue] = []
        passed: list[str] = []

        inducing = self._INDUCING_PATTERN
        if inducing.search(title) or inducing.search(content):
            issues.append(ValidationIssue(
                "high", "包含诱导分享/关注语句", "platform_rule"
            ))
        else:
            passed.append("无诱导分享/关注语句")

        if self._CONSECUTIVE_SYMBOLS.search(title):
            issues.append(ValidationIssue(
                "medium", "标题包含连续特殊符号", "title_format"
            ))
        else:
            passed.append("标题无连续特殊符号")

        if self._EXTERNAL_LINK.search(content):
            issues.append(ValidationIssue(
                "high",
                "正文包含外部链接(仅支持公众号链接和小程序)",
                "platform_rule",
            ))
        else:
            passed.append("无外部链接")

        marketing_issues, marketing_passed = self._check_marketing(content)
        issues.extend(marketing_issues)
        passed.extend(marketing_passed)
        return issues, passed

    def _check_xiaohongshu(
        self, content: str
    ) -> "tuple[list[ValidationIssue], list[str]]":
        """Xiaohongshu checks: 300-800 character guideline and cross-platform mentions."""
        issues: list[ValidationIssue] = []
        passed: list[str] = []

        content_len = len(content)
        if content_len > 800:
            issues.append(ValidationIssue(
                "medium",
                f"正文建议300-800字,当前 {content_len} 字",
                "content_length",
            ))
        elif content_len < 300:
            issues.append(ValidationIssue(
                "low",
                f"正文建议300-800字,当前仅 {content_len} 字",
                "content_length",
            ))
        else:
            passed.append(f"正文字数适宜({content_len}字)")

        found_cross = self._find_keywords(content, self._CROSS_PLATFORM_KEYWORDS)
        if found_cross:
            issues.append(ValidationIssue(
                "high",
                f"疑似其他平台引流: {', '.join(found_cross)}",
                "platform_rule",
            ))
        else:
            passed.append("未检测到其他平台引流信息")
        return issues, passed

    def _check_clickbait(
        self, title: str
    ) -> "tuple[list[ValidationIssue], list[str]]":
        """Report clickbait words found in the title (baijiahao and toutiao)."""
        found = self._find_keywords(title, self._CLICKBAIT_WORDS)
        if found:
            issue = ValidationIssue(
                "high",
                f"标题含标题党词汇: {', '.join(found)}",
                "title_content",
            )
            return [issue], []
        return [], ["标题无标题党词汇"]

    def _check_watermark(
        self, content: str
    ) -> "tuple[list[ValidationIssue], list[str]]":
        """Report mentions of another platform's watermark or logo (douyin)."""
        if self._WATERMARK_PATTERN.search(content):
            issue = ValidationIssue(
                "high", "内容包含其他平台水印信息", "platform_rule"
            )
            return [issue], []
        return [], ["未检测到其他平台水印"]
|
||
|
||
|
||
# Shared module-level instance exported for importers of this module.
validator = RuleValidator()
|