geo/backend/app/services/analysis/sentiment_service.py

399 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Sentiment analysis service - uses the DeepSeek API to analyse the sentiment that AI answers express toward a brand.
"""
import asyncio
import hashlib
import json
import logging
import time
from typing import Optional
from app.config import settings
from app.utils.json_extractor import extract_json
logger = logging.getLogger(__name__)
# Prompt template for sentiment analysis.
# It is filled in with str.format(brand_name=..., content=...), so the doubled
# braces around the JSON example are escapes that render as literal "{" / "}".
# The template text itself is sent to the model and must not be edited casually.
SENTIMENT_ANALYSIS_PROMPT = """分析以下AI回答中对品牌"{brand_name}"的情感倾向。
请严格返回以下JSON格式不要包含其他内容
{{
"sentiment": "positive" | "neutral" | "negative",
"confidence": 0.0到1.0之间的浮点数,
"key_phrases": ["关键情感短语1", "关键情感短语2"],
"reasoning": "判断理由的简要说明"
}}
判断标准:
- positive: AI明确推荐、赞扬、肯定该品牌或在对比中明显偏向该品牌
- neutral: AI客观提及该品牌无明确褒贬或褒贬平衡
- negative: AI批评、否定、不推荐该品牌或在对比中明显贬低该品牌
- confidence: 判断的置信度0.0表示完全不确定1.0表示完全确定
AI回答内容
{content}"""
class SentimentResult:
    """Value holder for the outcome of one sentiment analysis.

    Attributes:
        sentiment: One of "positive", "neutral" or "negative".
        confidence: Confidence of the judgement, from 0.0 to 1.0.
        key_phrases: Phrases that carried the sentiment.
        reasoning: Short explanation of the judgement.
    """

    def __init__(
        self,
        sentiment: str,
        confidence: float,
        key_phrases: list[str],
        reasoning: str,
    ):
        self.sentiment = sentiment
        self.confidence = confidence
        self.key_phrases = key_phrases
        self.reasoning = reasoning

    def to_dict(self) -> dict:
        """Return the four result fields as a plain dictionary."""
        field_names = ("sentiment", "confidence", "key_phrases", "reasoning")
        return {name: getattr(self, name) for name in field_names}
class SentimentCache:
    """
    In-memory cache of sentiment analysis results.

    Entries are keyed by a SHA-256 hash of ``brand_name`` and ``content`` so
    identical requests do not call the LLM again. Every entry remembers the
    ``time.time()`` value at which it was written:

    - an entry older than ``ttl_seconds`` is treated as missing and removed;
    - when a *new* key has to be stored while the cache already holds
      ``max_size`` entries, expired entries are purged first and, if that is
      not enough, the oldest 10% (at least one) are dropped.
    """

    def __init__(self, max_size: int = 1000, ttl_seconds: int = 86400):
        # key -> (result, insertion timestamp from time.time())
        self._cache: dict[str, tuple["SentimentResult", float]] = {}
        self._max_size = max_size
        self._ttl_seconds = ttl_seconds

    def _make_key(self, brand_name: str, content: str) -> str:
        """Return the hex SHA-256 digest used as the cache key."""
        raw = f"{brand_name}::{content}"
        return hashlib.sha256(raw.encode("utf-8")).hexdigest()

    def get(self, brand_name: str, content: str) -> Optional["SentimentResult"]:
        """Return the cached result, or None when absent or expired.

        An expired entry is deleted as a side effect of the lookup.
        """
        key = self._make_key(brand_name, content)
        entry = self._cache.get(key)
        if entry is None:
            return None
        result, timestamp = entry
        if time.time() - timestamp > self._ttl_seconds:
            del self._cache[key]
            return None
        return result

    def set(self, brand_name: str, content: str, result: "SentimentResult") -> None:
        """Store ``result`` for the given brand/content pair.

        Room is only made when the key is new: overwriting an existing entry
        does not grow the cache, so it must not push unrelated entries out.
        """
        key = self._make_key(brand_name, content)
        if key not in self._cache and len(self._cache) >= self._max_size:
            self._evict_expired()
            if len(self._cache) >= self._max_size:
                self._evict_oldest()
        self._cache[key] = (result, time.time())

    def _evict_expired(self) -> None:
        """Remove every entry whose age exceeds the TTL."""
        now = time.time()
        expired_keys = [
            key
            for key, (_, timestamp) in self._cache.items()
            if now - timestamp > self._ttl_seconds
        ]
        for key in expired_keys:
            del self._cache[key]

    def _evict_oldest(self) -> None:
        """Remove the oldest 10% of entries (always at least one)."""
        by_age = sorted(self._cache.items(), key=lambda item: item[1][1])
        evict_count = max(1, len(by_age) // 10)
        for key, _ in by_age[:evict_count]:
            del self._cache[key]

    def clear(self) -> None:
        """Drop every cached entry."""
        self._cache.clear()
class SentimentAnalysisService:
    """
    Sentiment analysis service.

    Uses the DeepSeek API to judge the sentiment an AI answer expresses
    toward a brand. Features:

    - Caching: an identical (brand, content) pair does not call the LLM again
    - Retry: failed API calls are retried with exponential backoff
    - Switch: the LLM path is only used when ``settings.ENABLE_LLM`` is truthy
      and an API key is available
    - Fallback: keyword-based analysis when the LLM is disabled or keeps failing
    """

    _VALID_SENTIMENTS = ("positive", "neutral", "negative")
    # Confidence used when the model omits the field or returns something unusable.
    _DEFAULT_CONFIDENCE = 0.5

    # Keyword tables for the rule-based fallback. Each keyword appears once;
    # content is lower-cased before matching, so ASCII keywords are lowercase.
    _POSITIVE_KEYWORDS = (
        "推荐", "领先", "优秀", "首选", "最佳", "出色", "卓越",
        "优势", "创新", "专业", "值得", "信赖", "优质", "好评",
        "突出", "领先地位", "行业标杆", "第一", "top", "best",
        "领先者", "佼佼者", "知名", "著名", "口碑好",
    )
    _NEGATIVE_KEYWORDS = (
        "不足", "缺陷", "问题", "较差", "落后", "劣势", "不推荐",
        "差评", "投诉", "风险", "隐患", "短板", "弱点", "不足之处",
        "有待改善", "不及", "逊色", "负面",
    )
    # Longest keywords are tried first so that e.g. "不推荐" claims its text
    # before the shorter positive keyword "推荐" gets a chance to match it.
    # sorted() is stable, which keeps the order deterministic for equal lengths.
    _KEYWORDS_LONGEST_FIRST = tuple(
        sorted(_POSITIVE_KEYWORDS + _NEGATIVE_KEYWORDS, key=len, reverse=True)
    )

    def __init__(
        self,
        api_key: Optional[str] = None,
        max_retries: int = 3,
        retry_delay: float = 1.0,
    ):
        """
        Args:
            api_key: DeepSeek API key; falls back to settings.DEEPSEEK_API_KEY.
            max_retries: Number of API attempts before falling back to rules.
            retry_delay: Base delay in seconds; doubled after every failed attempt.
        """
        self.api_key = api_key or settings.DEEPSEEK_API_KEY
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self._client = None
        self._cache = SentimentCache()

    @property
    def client(self):
        """Lazily created OpenAI-compatible client pointed at DeepSeek.

        Raises:
            RuntimeError: If the ``openai`` package is not installed.
        """
        if self._client is None:
            try:
                from openai import OpenAI
            except ImportError as exc:
                raise RuntimeError(
                    "请安装openai库: pip install openai"
                ) from exc
            self._client = OpenAI(
                api_key=self.api_key,
                base_url="https://api.deepseek.com",
            )
        return self._client

    async def analyze(
        self,
        brand_name: str,
        content: str,
    ) -> "SentimentResult":
        """
        Analyse the sentiment an AI answer expresses toward a brand.

        Args:
            brand_name: Brand name.
            content: Text of the AI answer.

        Returns:
            SentimentResult: The analysis result. Empty or whitespace-only
            content yields a neutral result with confidence 0.0.
        """
        if not content or not content.strip():
            return SentimentResult(
                sentiment="neutral",
                confidence=0.0,
                key_phrases=[],
                reasoning="内容为空,无法分析",
            )

        # Serve from cache when possible.
        cached = self._cache.get(brand_name, content)
        if cached is not None:
            logger.debug(f"情感分析命中缓存: brand={brand_name}")
            return cached

        # LLM disabled or no key: use the rule-based fallback.
        if not settings.ENABLE_LLM or not self.api_key:
            logger.info(
                f"LLM情感分析未启用 (ENABLE_LLM={settings.ENABLE_LLM}, "
                f"has_api_key={bool(self.api_key)}),使用规则分析"
            )
            result = self._rule_based_analysis(brand_name, content)
            self._cache.set(brand_name, content, result)
            return result

        # Call the DeepSeek API.
        # NOTE(review): _call_with_retry returns the rule-based fallback when
        # every attempt fails, and that degraded result is cached here like a
        # real LLM answer. Confirm whether that is intended.
        result = await self._call_with_retry(brand_name, content)
        self._cache.set(brand_name, content, result)
        return result

    async def _call_with_retry(
        self,
        brand_name: str,
        content: str,
    ) -> "SentimentResult":
        """Call the API up to ``max_retries`` times, then fall back to rules.

        Any exception from an attempt is logged and triggers the next attempt
        after ``retry_delay * 2 ** attempt`` seconds.
        """
        last_error = None
        for attempt in range(self.max_retries):
            try:
                return await self._call_deepseek(brand_name, content)
            except Exception as exc:
                last_error = exc
                logger.warning(
                    f"情感分析API调用失败 "
                    f"(尝试 {attempt + 1}/{self.max_retries}): {exc}"
                )
                if attempt < self.max_retries - 1:
                    # Exponential backoff; no sleep after the final attempt.
                    delay = self.retry_delay * (2 ** attempt)
                    await asyncio.sleep(delay)

        # Every attempt failed: degrade to the rule-based analysis.
        logger.error(
            f"情感分析API调用失败已重试{self.max_retries}次,"
            f"降级到规则分析: {last_error}"
        )
        return self._rule_based_analysis(brand_name, content)

    async def _call_deepseek(
        self,
        brand_name: str,
        content: str,
    ) -> "SentimentResult":
        """Build the prompt, run the blocking API call in a thread, parse it."""
        prompt = SENTIMENT_ANALYSIS_PROMPT.format(
            brand_name=brand_name,
            content=content[:3000],  # cap the content length to limit tokens
        )
        # The client call is synchronous, so it runs in a worker thread.
        response_dict = await asyncio.to_thread(
            self._sync_call_deepseek, prompt
        )
        return self._parse_response(response_dict)

    def _sync_call_deepseek(self, prompt: str) -> dict:
        """Send the prompt to DeepSeek synchronously and return the parsed JSON.

        Raises:
            RuntimeError: If the reply is empty or extract_json raises ValueError.
        """
        response = self.client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "你是一个专业的品牌情感分析专家。"
                        "你的任务是分析AI回答中对特定品牌的情感倾向。"
                        "请严格按照要求的JSON格式返回结果。"
                    ),
                },
                {"role": "user", "content": prompt},
            ],
            temperature=0.1,
            max_tokens=500,
        )
        content = response.choices[0].message.content
        if not content:
            raise RuntimeError("API返回空响应")
        # Pull the JSON payload out of the reply text.
        try:
            json_str = extract_json(content)
        except ValueError as exc:
            raise RuntimeError(str(exc)) from exc
        return json.loads(json_str)

    @staticmethod
    def _coerce_confidence(value: object) -> float:
        """Turn a raw confidence value into a float within [0.0, 1.0].

        Missing, non-numeric or NaN values become the default 0.5 instead of
        raising, so an otherwise valid answer is not thrown away.
        """
        default = SentimentAnalysisService._DEFAULT_CONFIDENCE
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            return default
        if number != number:  # NaN is the only float not equal to itself
            return default
        return max(0.0, min(1.0, number))

    def _parse_response(self, response: dict) -> "SentimentResult":
        """Validate and normalise the JSON object returned by the model.

        Unknown sentiment labels become "neutral", confidence is clamped to
        [0.0, 1.0], and at most 10 key phrases are kept.

        Raises:
            ValueError: If ``response`` is not a JSON object (dict).
        """
        if not isinstance(response, dict):
            raise ValueError("API返回的JSON不是对象")

        sentiment = str(response.get("sentiment", "neutral")).strip().lower()
        if sentiment not in self._VALID_SENTIMENTS:
            sentiment = "neutral"

        confidence = self._coerce_confidence(
            response.get("confidence", self._DEFAULT_CONFIDENCE)
        )

        key_phrases = response.get("key_phrases", [])
        if not isinstance(key_phrases, list):
            key_phrases = []
        key_phrases = [str(phrase) for phrase in key_phrases[:10]]  # at most 10

        reasoning = str(response.get("reasoning", ""))
        return SentimentResult(
            sentiment=sentiment,
            confidence=confidence,
            key_phrases=key_phrases,
            reasoning=reasoning,
        )

    @classmethod
    def _match_keywords(cls, content: str) -> tuple[list[str], list[str]]:
        """Return the (positive, negative) keywords found in ``content``.

        Matching is case-insensitive and longest-first. Text claimed by a
        keyword is masked, so a shorter keyword contained in it (for example
        "推荐" inside "不推荐") is only reported if it also occurs elsewhere.
        Both lists follow the order of the keyword tables.
        """
        remaining = content.lower()
        found = set()
        for keyword in cls._KEYWORDS_LONGEST_FIRST:
            if keyword in remaining:
                found.add(keyword)
                # Mask with NUL so neighbouring text cannot fuse into a new match.
                remaining = remaining.replace(keyword, "\x00")
        positive = [kw for kw in cls._POSITIVE_KEYWORDS if kw in found]
        negative = [kw for kw in cls._NEGATIVE_KEYWORDS if kw in found]
        return positive, negative

    def _rule_based_analysis(
        self,
        brand_name: str,
        content: str,
    ) -> "SentimentResult":
        """
        Rule-based sentiment analysis (fallback path).

        Counts distinct positive and negative keywords in ``content``; the
        larger side wins and a tie is neutral. ``brand_name`` is accepted for
        interface symmetry but is not used by the rules.
        """
        positive_matches, negative_matches = self._match_keywords(content)
        positive_count = len(positive_matches)
        negative_count = len(negative_matches)

        if positive_count > negative_count:
            sentiment = "positive"
            confidence = min(0.9, 0.5 + positive_count * 0.1)
            key_phrases = positive_matches[:5]
            reasoning = f"检测到{positive_count}个正面关键词"
        elif negative_count > positive_count:
            sentiment = "negative"
            confidence = min(0.9, 0.5 + negative_count * 0.1)
            key_phrases = negative_matches[:5]
            reasoning = f"检测到{negative_count}个负面关键词"
        else:
            sentiment = "neutral"
            confidence = 0.5
            key_phrases = positive_matches[:3] + negative_matches[:3]
            if positive_count == 0 and negative_count == 0:
                reasoning = "未检测到明显情感倾向关键词"
            else:
                reasoning = f"正面和负面关键词数量相当({positive_count}vs{negative_count})"

        return SentimentResult(
            sentiment=sentiment,
            confidence=confidence,
            key_phrases=key_phrases,
            reasoning=reasoning,
        )

    async def batch_analyze(
        self,
        items: list[tuple[str, str]],
    ) -> list["SentimentResult"]:
        """
        Analyse several (brand_name, content) pairs one after another.

        Args:
            items: List of (brand_name, content) tuples.

        Returns:
            List of SentimentResult in the same order as the input.
        """
        # Items are awaited sequentially, not concurrently.
        return [
            await self.analyze(brand_name, content)
            for brand_name, content in items
        ]

    def clear_cache(self) -> None:
        """Drop every cached analysis result."""
        self._cache.clear()
# Module-level singleton, created on first request
_sentiment_service: Optional[SentimentAnalysisService] = None


def get_sentiment_service() -> SentimentAnalysisService:
    """Return the shared SentimentAnalysisService, creating it on first use."""
    global _sentiment_service
    if _sentiment_service is not None:
        return _sentiment_service
    _sentiment_service = SentimentAnalysisService()
    return _sentiment_service