# geo/backend/app/services/analysis/sentiment_service.py

"""
情感分析服务 - 使用DeepSeek API分析AI回答中对品牌的情感倾向
"""
import asyncio
import hashlib
import json
import logging
import math
import re
import time
from typing import Optional

from app.config import settings
from app.utils.json_extractor import extract_json

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Prompt template for sentiment analysis. It is filled in with str.format()
# using the `brand_name` and `content` fields; the doubled braces ({{ }}) are
# literal braces of the JSON example shown to the model.
SENTIMENT_ANALYSIS_PROMPT = """分析以下AI回答中对品牌"{brand_name}"的情感倾向。

请严格返回以下JSON格式，不要包含其他内容：
{{
  "sentiment": "positive" | "neutral" | "negative",
  "confidence": 0.0到1.0之间的浮点数,
  "key_phrases": ["关键情感短语1", "关键情感短语2"],
  "reasoning": "判断理由的简要说明"
}}

判断标准：
- positive: AI明确推荐、赞扬、肯定该品牌，或在对比中明显偏向该品牌
- neutral: AI客观提及该品牌，无明确褒贬，或褒贬平衡
- negative: AI批评、否定、不推荐该品牌，或在对比中明显贬低该品牌
- confidence: 判断的置信度，0.0表示完全不确定，1.0表示完全确定

AI回答内容：
{content}"""


class SentimentResult:
    """Plain container for the outcome of one sentiment analysis."""

    def __init__(
        self,
        sentiment: str,
        confidence: float,
        key_phrases: list[str],
        reasoning: str,
    ):
        # sentiment:   "positive" / "neutral" / "negative"
        # confidence:  0.0 - 1.0
        self.sentiment, self.confidence = sentiment, confidence
        # key_phrases: phrases that carry the sentiment
        # reasoning:   short justification of the verdict
        self.key_phrases, self.reasoning = key_phrases, reasoning

    def to_dict(self) -> dict:
        """Return the four fields as a dict, in declaration order."""
        field_names = ("sentiment", "confidence", "key_phrases", "reasoning")
        return {name: getattr(self, name) for name in field_names}


class SentimentCache:
    """
    In-memory cache of sentiment analysis results.

    Lets callers skip a repeated LLM call for a (brand_name, content) pair
    that was already analysed. Entries are keyed by a SHA-256 digest of the
    pair and carry the wall-clock time at which they were stored; an entry
    older than ``ttl_seconds`` is treated as missing.

    The cache holds roughly ``max_size`` entries: when a *new* key arrives
    and the cache is full, expired entries are dropped first and, if that is
    not enough, the oldest ~10% (at least one) are evicted.
    """

    def __init__(self, max_size: int = 1000, ttl_seconds: int = 86400):
        # digest -> (result, time.time() when the entry was stored)
        self._cache: dict[str, tuple[SentimentResult, float]] = {}
        self._max_size = max_size
        self._ttl_seconds = ttl_seconds

    def _make_key(self, brand_name: str, content: str) -> str:
        """Build the cache key for a (brand_name, content) pair."""
        # Length-prefix the brand so the boundary between the two fields is
        # unambiguous: a plain "brand::content" join would give
        # ("a::b", "c") and ("a", "b::c") the same key.
        raw = f"{len(brand_name)}:{brand_name}:{content}"
        return hashlib.sha256(raw.encode("utf-8")).hexdigest()

    def get(self, brand_name: str, content: str) -> Optional["SentimentResult"]:
        """Return the cached result, or None if absent or expired.

        An expired entry is removed as a side effect of the lookup.
        """
        key = self._make_key(brand_name, content)
        entry = self._cache.get(key)
        if entry is None:
            return None
        result, timestamp = entry
        if time.time() - timestamp > self._ttl_seconds:
            del self._cache[key]
            return None
        return result

    def set(self, brand_name: str, content: str, result: "SentimentResult") -> None:
        """Store ``result`` for the pair, evicting old entries if needed."""
        key = self._make_key(brand_name, content)

        # Overwriting an existing key does not grow the cache, so it must not
        # push other live entries out.
        if key not in self._cache and len(self._cache) >= self._max_size:
            self._evict_expired()
            if len(self._cache) >= self._max_size:
                # Still full: drop the oldest 10% (at least one entry).
                oldest_first = sorted(
                    self._cache, key=lambda k: self._cache[k][1]
                )
                evict_count = max(1, len(oldest_first) // 10)
                for stale_key in oldest_first[:evict_count]:
                    del self._cache[stale_key]

        self._cache[key] = (result, time.time())

    def _evict_expired(self) -> None:
        """Remove every entry whose age exceeds the TTL."""
        now = time.time()
        expired_keys = [
            k for k, (_, ts) in self._cache.items()
            if now - ts > self._ttl_seconds
        ]
        for k in expired_keys:
            del self._cache[k]

    def clear(self) -> None:
        """Drop all cached entries."""
        self._cache.clear()


class SentimentAnalysisService:
    """
    Brand sentiment analysis service.

    Uses the DeepSeek chat API to judge how an AI-generated answer talks
    about a brand. Features:

    - Cache: results are stored per (brand_name, content) so identical input
      is not analysed twice.
    - Retry: failed API calls are retried with exponential backoff.
    - Switch: the LLM is only used when ``settings.ENABLE_LLM`` is truthy and
      an API key is available.
    - Fallback: keyword rules are used when the LLM is disabled or keeps
      failing.
    """

    # Keywords used by the rule-based fallback. Matching is done on the
    # lower-cased content, so ASCII keywords are written in lower case.
    _POSITIVE_KEYWORDS = (
        "推荐", "领先", "优秀", "首选", "最佳", "出色", "卓越",
        "优势", "创新", "专业", "值得", "信赖", "优质", "好评",
        "突出", "领先地位", "行业标杆", "第一", "top", "best",
        "领先者", "佼佼者", "知名", "著名", "口碑好",
    )
    _NEGATIVE_KEYWORDS = (
        "不足", "缺陷", "问题", "较差", "落后", "劣势", "不推荐",
        "差评", "投诉", "风险", "隐患", "短板", "弱点", "不足之处",
        "有待改善", "不及", "逊色", "负面",
    )
    # One alternation over all keywords, longest first, so that a span of
    # text is attributed to a single keyword: "不推荐" is consumed as a whole
    # and no longer also counts as the positive "推荐". ASCII keywords must
    # not be embedded in a longer Latin word ("top" must not hit "laptop").
    _KEYWORD_PATTERN = re.compile(
        "|".join(
            rf"(?<![a-z]){re.escape(kw)}(?![a-z])" if kw.isascii() else re.escape(kw)
            for kw in sorted(
                dict.fromkeys(_POSITIVE_KEYWORDS + _NEGATIVE_KEYWORDS),
                key=len,
                reverse=True,
            )
        )
    )

    def __init__(
        self,
        api_key: Optional[str] = None,
        max_retries: int = 3,
        retry_delay: float = 1.0,
    ):
        """
        Args:
            api_key: DeepSeek API key; falls back to settings.DEEPSEEK_API_KEY.
            max_retries: Maximum number of API attempts per analysis.
            retry_delay: Base delay in seconds; doubled after each failure.
        """
        self.api_key = api_key or settings.DEEPSEEK_API_KEY
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self._client = None  # created lazily by the `client` property
        self._cache = SentimentCache()

    @property
    def client(self):
        """Lazily created OpenAI-compatible client pointed at DeepSeek.

        Raises:
            RuntimeError: If the ``openai`` package is not installed.
        """
        if self._client is None:
            try:
                from openai import OpenAI
            except ImportError as exc:
                raise RuntimeError(
                    "请安装openai库: pip install openai"
                ) from exc
            self._client = OpenAI(
                api_key=self.api_key,
                base_url="https://api.deepseek.com",
            )
        return self._client

    async def analyze(
        self,
        brand_name: str,
        content: str,
    ) -> "SentimentResult":
        """
        Analyse the sentiment of an AI answer towards a brand.

        Args:
            brand_name: Brand name.
            content: Text of the AI answer.

        Returns:
            SentimentResult: The analysis result. Empty or whitespace-only
            content yields a neutral result with confidence 0.0.
        """
        if not content or not content.strip():
            return SentimentResult(
                sentiment="neutral",
                confidence=0.0,
                key_phrases=[],
                reasoning="内容为空，无法分析",
            )

        # Serve from cache when possible.
        cached = self._cache.get(brand_name, content)
        if cached is not None:
            logger.debug(f"情感分析命中缓存: brand={brand_name}")
            return cached

        # LLM switched off or no key: use the keyword rules.
        if not settings.ENABLE_LLM or not self.api_key:
            logger.info(
                f"LLM情感分析未启用 (ENABLE_LLM={settings.ENABLE_LLM}, "
                f"has_api_key={bool(self.api_key)})，使用规则分析"
            )
            result = self._rule_based_analysis(brand_name, content)
            self._cache.set(brand_name, content, result)
            return result

        # Call the DeepSeek API (with retries and rule-based fallback).
        # NOTE(review): a rule-based fallback produced after repeated API
        # failures is cached like a real LLM result and is served until the
        # cache entry expires, even if the API recovers - confirm intended.
        result = await self._call_with_retry(brand_name, content)
        self._cache.set(brand_name, content, result)
        return result

    async def _call_with_retry(
        self,
        brand_name: str,
        content: str,
    ) -> "SentimentResult":
        """Call the API up to ``max_retries`` times, then fall back to rules.

        Any exception from an attempt (network error, empty or unparsable
        response) counts as a failure. The wait before retry number k
        (0-based) is ``retry_delay * 2**k`` seconds.
        """
        last_error = None
        for attempt in range(self.max_retries):
            try:
                return await self._call_deepseek(brand_name, content)
            except Exception as e:
                last_error = e
                logger.warning(
                    f"情感分析API调用失败 "
                    f"(尝试 {attempt + 1}/{self.max_retries}): {e}"
                )
                if attempt < self.max_retries - 1:
                    # Exponential backoff; no sleep after the final attempt.
                    delay = self.retry_delay * (2 ** attempt)
                    await asyncio.sleep(delay)

        # Every attempt failed: degrade to the keyword rules.
        logger.error(
            f"情感分析API调用失败，已重试{self.max_retries}次，"
            f"降级到规则分析: {last_error}"
        )
        return self._rule_based_analysis(brand_name, content)

    async def _call_deepseek(
        self,
        brand_name: str,
        content: str,
    ) -> "SentimentResult":
        """Run one DeepSeek sentiment request and parse its reply."""
        prompt = SENTIMENT_ANALYSIS_PROMPT.format(
            brand_name=brand_name,
            content=content[:3000],  # cap the prompt size to limit tokens
        )

        # The client call is synchronous; run it in a worker thread so the
        # event loop is not blocked.
        response_dict = await asyncio.to_thread(
            self._sync_call_deepseek, prompt
        )

        return self._parse_response(response_dict)

    def _sync_call_deepseek(self, prompt: str) -> dict:
        """Blocking DeepSeek chat call; returns the decoded JSON payload.

        Raises:
            RuntimeError: If the reply is empty or ``extract_json`` rejects it.
        """
        response = self.client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "你是一个专业的品牌情感分析专家。"
                        "你的任务是分析AI回答中对特定品牌的情感倾向。"
                        "请严格按照要求的JSON格式返回结果。"
                    ),
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0.1,
            max_tokens=500,
        )

        content = response.choices[0].message.content
        if not content:
            raise RuntimeError("API返回空响应")

        # Pull the JSON document out of the model's reply.
        try:
            json_str = extract_json(content)
        except ValueError as e:
            raise RuntimeError(str(e)) from e
        return json.loads(json_str)

    @staticmethod
    def _coerce_confidence(raw: object, default: float = 0.5) -> float:
        """Turn a model-supplied confidence into a float within [0.0, 1.0].

        Missing, non-numeric and NaN values fall back to ``default`` instead
        of failing the whole response; other values are clamped.
        """
        try:
            value = float(raw)
        except (TypeError, ValueError, OverflowError):
            return default
        # NaN would slip through min()/max() and come out as 1.0.
        if math.isnan(value):
            return default
        return max(0.0, min(1.0, value))

    def _parse_response(self, response: dict) -> "SentimentResult":
        """Validate and normalise the decoded API payload.

        Unknown sentiment labels become "neutral", confidence is coerced into
        [0.0, 1.0] (0.5 when unusable), at most 10 key phrases are kept.

        Raises:
            ValueError: If the payload is not a JSON object.
        """
        if not isinstance(response, dict):
            raise ValueError(
                f"API响应不是JSON对象: {type(response).__name__}"
            )

        sentiment = str(response.get("sentiment", "neutral")).strip().lower()
        if sentiment not in ("positive", "neutral", "negative"):
            sentiment = "neutral"

        confidence = self._coerce_confidence(response.get("confidence", 0.5))

        key_phrases = response.get("key_phrases", [])
        if not isinstance(key_phrases, list):
            key_phrases = []
        key_phrases = [str(p) for p in key_phrases[:10]]  # at most 10 phrases

        # A JSON null must not turn into the literal text "None".
        raw_reasoning = response.get("reasoning")
        reasoning = "" if raw_reasoning is None else str(raw_reasoning)

        return SentimentResult(
            sentiment=sentiment,
            confidence=confidence,
            key_phrases=key_phrases,
            reasoning=reasoning,
        )

    @classmethod
    def _match_keywords(cls, content: str) -> tuple[list[str], list[str]]:
        """Return the (positive, negative) keywords found in ``content``.

        Each keyword is reported at most once, in keyword-list order. Matches
        do not overlap, and at any position the longest keyword wins.
        """
        hits = set(cls._KEYWORD_PATTERN.findall(content.lower()))
        positive = [kw for kw in cls._POSITIVE_KEYWORDS if kw in hits]
        negative = [kw for kw in cls._NEGATIVE_KEYWORDS if kw in hits]
        return positive, negative

    def _rule_based_analysis(
        self,
        brand_name: str,
        content: str,
    ) -> "SentimentResult":
        """
        Keyword-based sentiment analysis (fallback path).

        Counts the distinct positive and negative keywords present in the
        content; the larger count decides the label, a tie is neutral.
        ``brand_name`` is not used by the rules: the whole content is scanned.
        """
        positive_matches, negative_matches = self._match_keywords(content)
        positive_count = len(positive_matches)
        negative_count = len(negative_matches)

        if positive_count > negative_count:
            sentiment = "positive"
            confidence = min(0.9, 0.5 + positive_count * 0.1)
            key_phrases = positive_matches[:5]
            reasoning = f"检测到{positive_count}个正面关键词"
        elif negative_count > positive_count:
            sentiment = "negative"
            confidence = min(0.9, 0.5 + negative_count * 0.1)
            key_phrases = negative_matches[:5]
            reasoning = f"检测到{negative_count}个负面关键词"
        else:
            sentiment = "neutral"
            confidence = 0.5
            key_phrases = positive_matches[:3] + negative_matches[:3]
            if positive_count == 0 and negative_count == 0:
                reasoning = "未检测到明显情感倾向关键词"
            else:
                reasoning = f"正面和负面关键词数量相当({positive_count}vs{negative_count})"

        return SentimentResult(
            sentiment=sentiment,
            confidence=confidence,
            key_phrases=key_phrases,
            reasoning=reasoning,
        )

    async def batch_analyze(
        self,
        items: list[tuple[str, str]],
    ) -> "list[SentimentResult]":
        """
        Analyse several (brand_name, content) pairs.

        Items are processed one after another, so a repeated pair later in
        the batch is served from the cache.

        Args:
            items: List of (brand_name, content) tuples.

        Returns:
            List of SentimentResult in the same order as ``items``.
        """
        results = []
        for brand_name, content in items:
            result = await self.analyze(brand_name, content)
            results.append(result)
        return results

    def clear_cache(self) -> None:
        """Drop all cached analysis results."""
        self._cache.clear()


# Process-wide singleton, created on first use by get_sentiment_service().
_sentiment_service: Optional[SentimentAnalysisService] = None


def get_sentiment_service() -> SentimentAnalysisService:
    """Return the shared SentimentAnalysisService, creating it on first call."""
    global _sentiment_service
    service = _sentiment_service
    if service is None:
        service = _sentiment_service = SentimentAnalysisService()
    return service