# geo/backend/app/services/content/rule_validator.py

"""内容规则校验服务"""
import re
from dataclasses import dataclass
from typing import Optional

from app.services.distribution.platform_rules import PLATFORM_RULES


@dataclass
class ValidationIssue:
    """One problem found while validating a piece of content.

    Attributes:
        severity: Severity label: "high", "medium" or "low".
        message: Human-readable description of the problem.
        category: Short tag grouping the issue, e.g. "title_length".
    """

    severity: str
    message: str
    category: str


@dataclass
class ValidationResult:
    """Outcome of validating one title/body pair.

    Attributes:
        is_valid: Whether the content is considered acceptable.
        score: Integer quality score.
        issues: The problems found, as ``ValidationIssue`` objects.
        passed: Messages (``str``) describing the checks that passed.
    """

    is_valid: bool
    score: int
    issues: list
    passed: list


@dataclass
class AI_Pattern:
    """A single AI-writing trait detected in a text.

    Attributes:
        pattern: The word or structure pattern that matched.
        type: Kind of match: "banned_word" or "banned_structure".
        severity: Severity label: "medium" or "high".
    """

    pattern: str
    type: str
    severity: str


class RuleValidator:
    """Check a title/body pair against the publishing rules of one platform.

    Generic limits (title length, content length, AI-writing sensitivity and
    SEO tips) are read from ``PLATFORM_RULES``; a handful of platform-specific
    checks are hard-coded in ``_validate_platform_specific``.
    """

    # Points deducted from the 100-point score per issue, by severity.
    # Any severity not listed here costs ``_DEFAULT_PENALTY`` points.
    _SEVERITY_PENALTY = {"high": 15, "medium": 8}
    _DEFAULT_PENALTY = 3

    # The patterns and word lists below are built once at class creation
    # instead of on every validation call.

    # A share/follow/like/favourite verb followed, within four characters,
    # by a "get a reward" word.
    _INDUCING_PATTERN = re.compile(
        r"(转发|分享|关注|点赞|收藏).{0,4}(领|获|得|拿|解锁|免费)",
        re.IGNORECASE,
    )

    # Three or more consecutive exclamation/question marks (ASCII or
    # full-width).
    _CONSECUTIVE_SYMBOLS = re.compile(r"[!！?？]{3,}")

    # http(s) links whose host is not one of the allowed WeChat hosts.  The
    # inner lookahead requires the allowed host to end right there, so a
    # look-alike such as "mp.weixin.qq.com.evil.example" is still reported
    # as an external link.
    _EXTERNAL_LINK = re.compile(
        r"https?://"
        r"(?!(?:mp\.weixin\.qq\.com|wx\.qq\.com)(?![A-Za-z0-9-]|\.[A-Za-z0-9-]))"
        r"[^\s<>)）]+",
        re.IGNORECASE,
    )

    # Another platform's name followed later on the same line by a
    # watermark/logo word.
    _WATERMARK_PATTERN = re.compile(
        r"(抖音|快手|小红书|微博|B站|bilibili).*(水印|logo)",
        re.IGNORECASE,
    )

    # Tuples (not sets) so that reported words keep a stable order.
    _CLICKBAIT_WORDS = (
        "震惊", "惊呆", "吓死", "笑死", "疯传", "刷屏", "出大事", "不敢相信",
    )
    _MARKETING_WORDS = ("购买", "下单", "优惠价", "限时折扣", "点击购买")
    _CROSS_PLATFORM_KEYWORDS = ("微信", "公众号", "抖音号", "微博")

    def validate(self, content: str, title: str, platform: str) -> ValidationResult:
        """
        Validate a title and body against the rules of ``platform``.

        Args:
            content: Body text.
            title: Title text.
            platform: Platform identifier used as the key into
                ``PLATFORM_RULES``.

        Returns:
            ValidationResult: ``is_valid`` is False when any issue has
            severity "high"; ``score`` starts at 100, loses 15/8/3 points
            per high/medium/other issue and never drops below 0.

        Raises:
            ValueError: If ``PLATFORM_RULES`` has no (or an empty) entry for
                ``platform``.
        """
        rules = PLATFORM_RULES.get(platform)
        if not rules:
            raise ValueError(f"不支持的平台: {platform}")

        issues: list[ValidationIssue] = []
        passed: list[str] = []

        # Title length: fall back to 5..30 characters when the platform
        # does not configure limits.
        title_len = len(title)
        title_rules = rules.get("title_rules", {})
        max_title = title_rules.get("max_length", 30)
        min_title = title_rules.get("min_length", 5)

        if title_len > max_title:
            issues.append(ValidationIssue(
                "high",
                f"标题长度 {title_len} 超过限制 {max_title}",
                "title_length",
            ))
        elif title_len < min_title:
            issues.append(ValidationIssue(
                "medium",
                f"标题长度 {title_len} 低于最低要求 {min_title}",
                "title_length",
            ))
        else:
            passed.append(f"标题长度合规（{title_len}/{max_title}）")

        # Content length: a minimum of 0 means "no minimum configured".
        content_len = len(content)
        content_rules = rules.get("content_length", {})
        max_content = content_rules.get("max", 20000)
        min_content = content_rules.get("min", 0)

        if content_len > max_content:
            issues.append(ValidationIssue(
                "high",
                f"内容长度 {content_len} 超过限制 {max_content}",
                "content_length",
            ))
        elif min_content > 0 and content_len < min_content:
            issues.append(ValidationIssue(
                "medium",
                f"内容长度 {content_len} 低于建议最低 {min_content}",
                "content_length",
            ))
        else:
            passed.append(f"内容长度合规（{content_len}/{max_content}）")

        # AI-writing traits are only checked on platforms that require
        # humanized text.
        ai_sensitivity = rules.get("ai_sensitivity", {})
        if ai_sensitivity.get("humanization_required", False):
            for result in self.detect_ai_patterns(content, platform):
                issues.append(ValidationIssue(
                    # Use the severity reported by the detector rather than
                    # a second hard-coded copy of it.
                    result.severity,
                    f"发现AI写作特征: {result.pattern}",
                    "ai_pattern",
                ))

        # Hard-coded platform-specific checks.
        platform_issues, platform_passed = self._validate_platform_specific(
            content, title, platform
        )
        issues.extend(platform_issues)
        passed.extend(platform_passed)

        penalty = sum(
            self._SEVERITY_PENALTY.get(issue.severity, self._DEFAULT_PENALTY)
            for issue in issues
        )
        score = max(0, 100 - penalty)

        # Content is valid as long as no issue is "high".
        is_valid = all(issue.severity != "high" for issue in issues)

        return ValidationResult(is_valid, score, issues, passed)

    def detect_ai_patterns(self, content: str, platform: str) -> list[AI_Pattern]:
        """
        Detect AI-writing traits configured for ``platform`` in ``content``.

        Banned words are matched as plain substrings; banned structures are
        treated as regular expressions and matched with ``re.search``.

        Args:
            content: Body text.
            platform: Platform identifier used as the key into
                ``PLATFORM_RULES``.

        Returns:
            list[AI_Pattern]: One entry per banned word found, plus at most
            one entry for the first banned structure that matches.  Empty
            when the platform has no rules.
        """
        rules = PLATFORM_RULES.get(platform)
        if not rules:
            return []

        ai_config = rules.get("ai_sensitivity", {})
        banned_patterns = ai_config.get("banned_patterns", [])
        banned_structures = ai_config.get("banned_structures", [])

        results: list[AI_Pattern] = [
            AI_Pattern(word, "banned_word", "medium")
            for word in banned_patterns
            if word in content
        ]

        # NOTE(review): only the first matching structure is reported (the
        # loop stops at the first hit) while every banned word is reported;
        # confirm this asymmetry is intentional.
        for structure in banned_structures:
            if re.search(structure, content):
                results.append(AI_Pattern(structure, "banned_structure", "medium"))
                break

        return results

    def get_optimization_tips(self, platform: str) -> list[str]:
        """
        Return the optimization tips configured for ``platform``.

        Args:
            platform: Platform identifier used as the key into
                ``PLATFORM_RULES``.

        Returns:
            list[str]: The platform's "seo_tips" entry, or an empty list
            when the platform has no rules or no tips.
        """
        rules = PLATFORM_RULES.get(platform)
        if not rules:
            return []
        return rules.get("seo_tips", [])

    @staticmethod
    def _find_keywords(text: str, keywords) -> list[str]:
        """Return the entries of ``keywords`` that occur in ``text``, in order."""
        return [word for word in keywords if word in text]

    def _check_marketing_words(
        self, content: str
    ) -> tuple[list[ValidationIssue], list[str]]:
        """Check ``content`` for marketing words; return (issues, passed)."""
        found_marketing = self._find_keywords(content, self._MARKETING_WORDS)
        if found_marketing:
            issue = ValidationIssue(
                "medium",
                f"疑似营销用语: {', '.join(found_marketing)}",
                "platform_rule",
            )
            return [issue], []
        return [], ["未检测到过度营销用语"]

    def _validate_platform_specific(
        self, content: str, title: str, platform: str
    ) -> tuple[list[ValidationIssue], list[str]]:
        """
        Run the hard-coded checks that apply to ``platform`` only.

        Args:
            content: Body text.
            title: Title text.
            platform: Platform identifier; identifiers without hard-coded
                checks yield two empty lists.

        Returns:
            tuple: ``(issues, passed)`` where ``issues`` is a list of
            ``ValidationIssue`` and ``passed`` a list of messages for the
            checks that succeeded.
        """
        issues: list[ValidationIssue] = []
        passed: list[str] = []

        if platform == "wechat":
            # Inducement to share/follow, in either the title or the body.
            if (self._INDUCING_PATTERN.search(title)
                    or self._INDUCING_PATTERN.search(content)):
                issues.append(ValidationIssue(
                    "high",
                    "包含诱导分享/关注语句",
                    "platform_rule",
                ))
            else:
                passed.append("无诱导分享/关注语句")

            # Runs of exclamation/question marks in the title.
            if self._CONSECUTIVE_SYMBOLS.search(title):
                issues.append(ValidationIssue(
                    "medium",
                    "标题包含连续特殊符号",
                    "title_format",
                ))
            else:
                passed.append("标题无连续特殊符号")

            # Links to hosts other than the allowed WeChat ones.
            if self._EXTERNAL_LINK.search(content):
                issues.append(ValidationIssue(
                    "high",
                    "正文包含外部链接（仅支持公众号链接和小程序）",
                    "platform_rule",
                ))
            else:
                passed.append("无外部链接")

            marketing_issues, marketing_passed = self._check_marketing_words(content)
            issues.extend(marketing_issues)
            passed.extend(marketing_passed)

        elif platform == "zhihu":
            marketing_issues, marketing_passed = self._check_marketing_words(content)
            issues.extend(marketing_issues)
            passed.extend(marketing_passed)

        elif platform == "xiaohongshu":
            # Recommended body length is 300-800 characters.
            content_len = len(content)
            if content_len > 800:
                issues.append(ValidationIssue(
                    "medium",
                    f"正文建议300-800字，当前 {content_len} 字",
                    "content_length",
                ))
            elif content_len < 300:
                issues.append(ValidationIssue(
                    "low",
                    f"正文建议300-800字，当前仅 {content_len} 字",
                    "content_length",
                ))
            else:
                passed.append(f"正文字数适宜（{content_len}字）")

            # Mentions of other platforms are treated as traffic diversion.
            found_cross = self._find_keywords(content, self._CROSS_PLATFORM_KEYWORDS)
            if found_cross:
                issues.append(ValidationIssue(
                    "high",
                    f"疑似其他平台引流: {', '.join(found_cross)}",
                    "platform_rule",
                ))
            else:
                passed.append("未检测到其他平台引流信息")

        elif platform in ("baijiahao", "toutiao"):
            # Substring matching is required here: the clickbait entries are
            # multi-character words, so intersecting them with the set of
            # single characters of the title could never match.
            found_clickbait = self._find_keywords(title, self._CLICKBAIT_WORDS)
            if found_clickbait:
                issues.append(ValidationIssue(
                    "high",
                    f"标题含标题党词汇: {', '.join(found_clickbait)}",
                    "title_content",
                ))
            else:
                passed.append("标题无标题党词汇")

        elif platform == "douyin":
            if self._WATERMARK_PATTERN.search(content):
                issues.append(ValidationIssue(
                    "high",
                    "内容包含其他平台水印信息",
                    "platform_rule",
                ))
            else:
                passed.append("未检测到其他平台水印")

        return issues, passed


# Exported singleton: a module-level RuleValidator instance.
validator = RuleValidator()