geo/backend/app/services/content/rule_validator.py

319 lines
10 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""内容规则校验服务"""
import re
from dataclasses import dataclass
from typing import Optional
from app.services.distribution.platform_rules import PLATFORM_RULES
@dataclass
class ValidationIssue:
    """A single rule violation found while validating a piece of content.

    Instances are built positionally as ``(severity, message, category)``.
    """

    # Severity level of the problem: "high", "medium" or "low".
    severity: str
    # Human-readable description of what is wrong.
    message: str
    # Machine-readable tag naming the rule family that was violated.
    category: str
@dataclass
class ValidationResult:
    """Aggregated outcome of one validation run.

    Instances are built positionally as ``(is_valid, score, issues, passed)``.
    """

    # Overall verdict for the validated content.
    is_valid: bool
    # Integer quality score for the validated content.
    score: int
    # Problems that were found; a list of ValidationIssue objects.
    issues: list
    # Descriptions of the checks that succeeded; a list of str.
    passed: list
@dataclass
class AI_Pattern:
    """One AI-writing trait detected in a piece of content.

    Instances are built positionally as ``(pattern, type, severity)``.
    """

    # The banned word or the regular expression that matched.
    pattern: str
    # Kind of trait: "banned_word" or "banned_structure".
    type: str
    # Severity level: "medium" or "high".
    severity: str
class RuleValidator:
    """Validate a title/body pair against per-platform publishing rules.

    Generic limits (title length, body length, AI-writing sensitivity) are
    read from ``PLATFORM_RULES``; checks that apply to specific platforms only
    are hard-coded in ``_validate_platform_specific``.
    """

    # Points deducted from the 100-point score per issue, keyed by severity.
    # Any severity that is not listed costs ``_DEFAULT_PENALTY``.
    _SEVERITY_PENALTY = {"high": 15, "medium": 8}
    _DEFAULT_PENALTY = 3

    # "Share/follow/like ... to get/unlock ..." style inducement phrases.
    _INDUCING_RE = re.compile(
        r"(转发|分享|关注|点赞|收藏).{0,4}(领|获|得|拿|解锁|免费)",
        re.IGNORECASE,
    )
    # Three or more consecutive exclamation/question marks. The full-width
    # forms are included because Chinese titles normally use them.
    _CONSECUTIVE_SYMBOLS_RE = re.compile(r"[!?！？]{3,}")
    # http(s) links whose host does not start with an allow-listed prefix.
    # NOTE(review): the lookahead is a prefix match only, so a host such as
    # "mp.weixin.qq.com.example.com" is also exempted — confirm whether the
    # allow-list should be anchored at the end of the host name.
    _EXTERNAL_LINK_RE = re.compile(
        r"https?://(?!mp\.weixin\.qq\.com|wx\.qq\.com|weixin://)[^\s<>)]+",
        re.IGNORECASE,
    )
    # Mentions of another platform followed by "watermark"/"logo".
    _WATERMARK_RE = re.compile(
        r"(抖音|快手|小红书|微博|B站|bilibili).*(水印|logo)",
        re.IGNORECASE,
    )
    # Kept as tuples so that reported words always appear in a stable order.
    _CLICKBAIT_WORDS = (
        "震惊", "惊呆", "吓死", "笑死", "疯传", "刷屏", "出大事", "不敢相信",
    )
    _MARKETING_WORDS = ("购买", "下单", "优惠价", "限时折扣", "点击购买")
    _CROSS_PLATFORM_KEYWORDS = ("微信", "公众号", "抖音号", "微博")

    def validate(self, content: str, title: str, platform: str) -> ValidationResult:
        """Check whether the content complies with the rules of a platform.

        Args:
            content: Body text.
            title: Title text.
            platform: Platform identifier used as the key into PLATFORM_RULES.

        Returns:
            ValidationResult: ``is_valid`` is True when no issue has severity
            "high"; ``score`` starts at 100 and loses 15 / 8 / 3 points per
            high / medium / other issue, never dropping below 0.

        Raises:
            ValueError: If PLATFORM_RULES has no (non-empty) entry for
                ``platform``.
        """
        rules = PLATFORM_RULES.get(platform)
        if not rules:
            raise ValueError(f"不支持的平台: {platform}")

        issues: list[ValidationIssue] = []
        passed: list[str] = []

        # Title length.
        title_len = len(title)
        title_rules = rules.get("title_rules", {})
        max_title = title_rules.get("max_length", 30)
        min_title = title_rules.get("min_length", 5)
        if title_len > max_title:
            issues.append(ValidationIssue(
                "high",
                f"标题长度 {title_len} 超过限制 {max_title}",
                "title_length",
            ))
        elif title_len < min_title:
            issues.append(ValidationIssue(
                "medium",
                f"标题长度 {title_len} 低于最低要求 {min_title}",
                "title_length",
            ))
        else:
            passed.append(f"标题长度合规({title_len}/{max_title})")

        # Body length. A minimum of 0 means "no minimum configured".
        content_len = len(content)
        content_rules = rules.get("content_length", {})
        max_content = content_rules.get("max", 20000)
        min_content = content_rules.get("min", 0)
        if content_len > max_content:
            issues.append(ValidationIssue(
                "high",
                f"内容长度 {content_len} 超过限制 {max_content}",
                "content_length",
            ))
        elif min_content > 0 and content_len < min_content:
            issues.append(ValidationIssue(
                "medium",
                f"内容长度 {content_len} 低于建议最低 {min_content}",
                "content_length",
            ))
        else:
            passed.append(f"内容长度合规({content_len}/{max_content})")

        # AI-writing traits, only for platforms that require humanization.
        ai_sensitivity = rules.get("ai_sensitivity", {})
        if ai_sensitivity.get("humanization_required", False):
            for result in self.detect_ai_patterns(content, platform):
                # Use the severity carried by the detected pattern instead of
                # repeating it here (detect_ai_patterns emits "medium").
                issues.append(ValidationIssue(
                    result.severity,
                    f"发现AI写作特征: {result.pattern}",
                    "ai_pattern",
                ))

        # Platform-specific rules.
        platform_issues, platform_passed = self._validate_platform_specific(
            content, title, platform
        )
        issues.extend(platform_issues)
        passed.extend(platform_passed)

        # Score: 100 minus the accumulated penalties, floored at 0.
        penalty = sum(
            self._SEVERITY_PENALTY.get(issue.severity, self._DEFAULT_PENALTY)
            for issue in issues
        )
        score = max(0, 100 - penalty)

        # The content is valid as long as there is no high-severity issue.
        is_valid = all(issue.severity != "high" for issue in issues)
        return ValidationResult(is_valid, score, issues, passed)

    def detect_ai_patterns(self, content: str, platform: str) -> list[AI_Pattern]:
        """Detect AI-writing traits configured for a platform.

        Args:
            content: Body text.
            platform: Platform identifier used as the key into PLATFORM_RULES.

        Returns:
            list[AI_Pattern]: One entry per banned word contained in
            ``content``, followed by at most one entry for the first banned
            structure (regular expression) that matches. Empty when the
            platform has no rules.
        """
        rules = PLATFORM_RULES.get(platform)
        if not rules:
            return []

        ai_config = rules.get("ai_sensitivity", {})
        banned_patterns = ai_config.get("banned_patterns", [])
        banned_structures = ai_config.get("banned_structures", [])

        # Banned words are plain substrings.
        results: list[AI_Pattern] = [
            AI_Pattern(word, "banned_word", "medium")
            for word in banned_patterns
            if word in content
        ]

        # Banned structures are regular expressions; only the first one that
        # matches is reported.
        for structure in banned_structures:
            if re.search(structure, content):
                results.append(AI_Pattern(structure, "banned_structure", "medium"))
                break
        return results

    def get_optimization_tips(self, platform: str) -> list[str]:
        """Return the optimization tips configured for a platform.

        Args:
            platform: Platform identifier used as the key into PLATFORM_RULES.

        Returns:
            list[str]: The platform's "seo_tips" entry, or an empty list when
            the platform has no rules or no tips.
        """
        rules = PLATFORM_RULES.get(platform)
        if not rules:
            return []
        return rules.get("seo_tips", [])

    def _check_marketing(self, content: str, issues: list, passed: list) -> None:
        """Append the marketing-wording verdict for ``content`` to the lists.

        Adds one medium issue naming every marketing word found in the body,
        or one passed message when none is found.
        """
        found_marketing = [w for w in self._MARKETING_WORDS if w in content]
        if found_marketing:
            issues.append(ValidationIssue(
                "medium",
                f"疑似营销用语: {', '.join(found_marketing)}",
                "platform_rule",
            ))
        else:
            passed.append("未检测到过度营销用语")

    def _validate_platform_specific(
        self, content: str, title: str, platform: str
    ) -> tuple[list[ValidationIssue], list[str]]:
        """Run the checks that are hard-coded for individual platforms.

        Args:
            content: Body text.
            title: Title text.
            platform: Platform identifier.

        Returns:
            tuple: ``(issues, passed)``. Both lists are empty for platforms
            without hard-coded checks.
        """
        issues: list[ValidationIssue] = []
        passed: list[str] = []

        if platform == "wechat":
            # Inducement to share/follow, in either the title or the body.
            if self._INDUCING_RE.search(title) or self._INDUCING_RE.search(content):
                issues.append(ValidationIssue(
                    "high",
                    "包含诱导分享/关注语句",
                    "platform_rule",
                ))
            else:
                passed.append("无诱导分享/关注语句")

            # Runs of exclamation/question marks in the title.
            if self._CONSECUTIVE_SYMBOLS_RE.search(title):
                issues.append(ValidationIssue(
                    "medium",
                    "标题包含连续特殊符号",
                    "title_format",
                ))
            else:
                passed.append("标题无连续特殊符号")

            # Links in the body that are not on the allow-list.
            if self._EXTERNAL_LINK_RE.search(content):
                issues.append(ValidationIssue(
                    "high",
                    "正文包含外部链接(仅支持公众号链接和小程序)",
                    "platform_rule",
                ))
            else:
                passed.append("无外部链接")

            self._check_marketing(content, issues, passed)

        elif platform == "zhihu":
            self._check_marketing(content, issues, passed)

        elif platform == "xiaohongshu":
            # Recommended body length is 300-800 characters.
            content_len = len(content)
            if content_len > 800:
                issues.append(ValidationIssue(
                    "medium",
                    f"正文建议300-800字当前 {content_len}",
                    "content_length",
                ))
            elif content_len < 300:
                issues.append(ValidationIssue(
                    "low",
                    f"正文建议300-800字当前仅 {content_len}",
                    "content_length",
                ))
            else:
                passed.append(f"正文字数适宜({content_len}字)")

            # Mentions of other platforms are treated as traffic diversion.
            found_cross = [k for k in self._CROSS_PLATFORM_KEYWORDS if k in content]
            if found_cross:
                issues.append(ValidationIssue(
                    "high",
                    f"疑似其他平台引流: {', '.join(found_cross)}",
                    "platform_rule",
                ))
            else:
                passed.append("未检测到其他平台引流信息")

        elif platform in ("baijiahao", "toutiao"):
            # Substring test: intersecting the words with set(title) would
            # compare them against single characters and never match.
            found_clickbait = [w for w in self._CLICKBAIT_WORDS if w in title]
            if found_clickbait:
                issues.append(ValidationIssue(
                    "high",
                    f"标题含标题党词汇: {', '.join(found_clickbait)}",
                    "title_content",
                ))
            else:
                passed.append("标题无标题党词汇")

        elif platform == "douyin":
            # References to another platform's watermark/logo in the body.
            if self._WATERMARK_RE.search(content):
                issues.append(ValidationIssue(
                    "high",
                    "内容包含其他平台水印信息",
                    "platform_rule",
                ))
            else:
                passed.append("未检测到其他平台水印")

        return issues, passed
# Exported singleton: a module-level RuleValidator instance.
validator = RuleValidator()