# geo/backend/app/services/competitor/competitor_analyzer_service.py
#
# Listing metadata from the source viewer: 750 lines, 28 KiB, Python.
#
# Viewer notice: this file contains ambiguous Unicode characters, i.e.
# characters that might be confused with other characters. If that is
# intentional, the warning can be safely ignored.


import inspect
import json
import logging
import uuid
from collections import defaultdict
from datetime import datetime, timedelta
from typing import Callable

from sqlalchemy import select, func, and_
from sqlalchemy.ext.asyncio import AsyncSession

from app.database import AsyncSessionLocal
from app.models.brand import Brand
from app.models.competitor import Competitor
from app.models.competitor_insight import CompetitorInsight
from app.models.citation_record import CitationRecord
from app.models.query import Query
from app.services.llm import LLMFactory, LLMError
from app.utils.json_extractor import extract_json

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Analysis types accepted by CompetitorAnalyzerService.analyze_competitor.
# Any value outside this list is rejected there with a ValueError, and the
# whole list is used when the caller does not pick specific types.
VALID_ANALYSIS_TYPES = [
    "citation_gap",
    "content_strategy",
    "platform_coverage",
    "query_overlap",
    "differentiation",
]

class CompetitorAnalyzerService:
    """Compare a brand's citation records against those of its competitors.

    The public entry point is :meth:`analyze_competitor`, which loads the
    brand and its competitors, aggregates citation statistics for each side,
    builds one ``CompetitorInsight`` row per (competitor, analysis type) and
    persists them. The remaining public coroutines are the individual
    comparison steps and operate on the plain dicts produced by
    :meth:`_aggregate_citation_data`.
    """

    async def analyze_competitor(
        self,
        brand_id: uuid.UUID,
        analysis_types: list[str] | None = None,
        period_days: int = 30,
        progress_callback: Callable[[float, str], object] | None = None,
    ) -> dict:
        """Run the requested analyses for every competitor of a brand.

        Args:
            brand_id: Primary key of the brand to analyse.
            analysis_types: Subset of ``VALID_ANALYSIS_TYPES``; ``None`` means
                all of them.
            period_days: Size of the look-back window, in days.
            progress_callback: Optional ``callback(fraction, message)``. It
                may be a plain function or a coroutine function; an awaitable
                result is awaited.

        Returns:
            A dict with ``brand_id``, ``brand_name``, the serialised
            ``insights`` and their ``total`` count.

        Raises:
            ValueError: If an analysis type is unsupported, the brand does
                not exist, or the brand has no competitors.
        """
        if analysis_types is None:
            analysis_types = VALID_ANALYSIS_TYPES
        invalid = set(analysis_types) - set(VALID_ANALYSIS_TYPES)
        if invalid:
            # Sorted so the error message is deterministic.
            raise ValueError(f"不支持的分析类型: {', '.join(sorted(map(str, invalid)))}")

        async with AsyncSessionLocal() as session:
            brand = await session.get(Brand, brand_id)
            if not brand:
                raise ValueError(f"品牌不存在: {brand_id}")
            # Read the name now: after commit() the ORM instance may be
            # expired, and touching an expired attribute on an async session
            # would trigger an implicit (unsupported) lazy load.
            brand_name = brand.name

            await self._report_progress(progress_callback, 0.1, "获取竞品列表...")
            competitors = await self._get_competitors(session, brand_id)
            if not competitors:
                raise ValueError("未找到竞品数据")

            await self._report_progress(progress_callback, 0.2, "聚合品牌引用数据...")
            brand_citation_data = await self._aggregate_citation_data(
                session, brand_id, brand_name, period_days,
            )

            results = []
            total = len(competitors)
            for i, competitor in enumerate(competitors):
                # Competitor analysis occupies the 25%..75% progress band.
                await self._report_progress(
                    progress_callback,
                    0.25 + (0.5 * i / total),
                    f"分析竞品 {competitor.name}...",
                )
                competitor_citation_data = await self._aggregate_citation_data(
                    session, brand_id, competitor.name, period_days,
                )
                for analysis_type in analysis_types:
                    insight = await self._build_insight(
                        session=session,
                        brand_id=brand_id,
                        brand_name=brand_name,
                        competitor=competitor,
                        analysis_type=analysis_type,
                        brand_data=brand_citation_data,
                        competitor_data=competitor_citation_data,
                        period_days=period_days,
                    )
                    session.add(insight)
                    results.append(insight)

            await session.commit()
            # Reload each row so database-populated columns (id, created_at)
            # are available for serialisation below.
            for r in results:
                await session.refresh(r)

            return {
                "brand_id": str(brand_id),
                "brand_name": brand_name,
                "insights": [
                    {
                        "id": str(r.id),
                        "competitor_name": r.competitor_name,
                        "analysis_type": r.analysis_type,
                        "citation_count_brand": r.citation_count_brand,
                        "citation_count_competitor": r.citation_count_competitor,
                        "sentiment_brand": r.sentiment_brand,
                        "sentiment_competitor": r.sentiment_competitor,
                        "platform_breakdown": r.platform_breakdown,
                        "gap_analysis": r.gap_analysis,
                        "opportunity_areas": r.opportunity_areas,
                        "recommendations": r.recommendations,
                        "confidence": r.confidence,
                        "period_days": r.period_days,
                        "created_at": r.created_at.isoformat() if r.created_at else None,
                    }
                    for r in results
                ],
                "total": len(results),
            }

    async def _report_progress(
        self,
        progress_callback: Callable[[float, str], object] | None,
        fraction: float,
        message: str,
    ) -> None:
        """Invoke ``progress_callback`` if given, awaiting it when it is async."""
        if progress_callback is None:
            return
        outcome = progress_callback(fraction, message)
        if inspect.isawaitable(outcome):
            await outcome

    async def compare_citation_volume(
        self,
        brand_data: dict,
        competitor_data: dict,
    ) -> dict:
        """Compare raw citation counts and each side's share of the total.

        Both arguments must contain ``citation_count`` and ``by_platform``.
        Shares are 0.0 when neither side has any citation.
        """
        brand_count = brand_data["citation_count"]
        competitor_count = competitor_data["citation_count"]
        total = brand_count + competitor_count
        return {
            "brand": brand_count,
            "competitor": competitor_count,
            "diff": brand_count - competitor_count,
            "brand_share": round(brand_count / total, 4) if total > 0 else 0.0,
            "competitor_share": round(competitor_count / total, 4) if total > 0 else 0.0,
            "by_platform": self._compare_platform_citations(
                brand_data, competitor_data,
            ),
        }

    async def compare_citation_quality(
        self,
        brand_data: dict,
        competitor_data: dict,
    ) -> dict:
        """Compare positive-sentiment ratio and average citation rank.

        Each ``diff`` is brand minus competitor. Missing keys default to 0.0;
        note that an ``avg_rank`` of 0.0 as produced by
        ``_aggregate_citation_data`` means "no ranked citations", so the
        ranking ``diff`` is only meaningful when both ranks are positive.
        """
        brand_positive = brand_data.get("positive_ratio", 0.0)
        competitor_positive = competitor_data.get("positive_ratio", 0.0)
        brand_rank = brand_data.get("avg_rank", 0.0)
        competitor_rank = competitor_data.get("avg_rank", 0.0)
        return {
            "sentiment": {
                "brand_positive_ratio": brand_positive,
                "competitor_positive_ratio": competitor_positive,
                "diff": round(brand_positive - competitor_positive, 4),
            },
            "ranking": {
                "brand_avg_rank": brand_rank,
                "competitor_avg_rank": competitor_rank,
                "diff": round(brand_rank - competitor_rank, 2),
            },
            "brand_sentiment_breakdown": brand_data.get("sentiment_breakdown", {}),
            "competitor_sentiment_breakdown": competitor_data.get("sentiment_breakdown", {}),
        }

    async def analyze_content_strategy(
        self,
        brand_data: dict,
        competitor_data: dict,
    ) -> dict:
        """Compare the ``content_types`` counters of brand and competitor.

        Returns per-type counts for both sides, the types only one side has,
        and the competitor's five most frequent types as ``(type, count)``
        pairs.
        """
        brand_types = brand_data.get("content_types", {})
        competitor_types = competitor_data.get("content_types", {})
        brand_keys = set(brand_types)
        competitor_keys = set(competitor_types)
        type_comparison = {
            ct: {
                "brand": brand_types.get(ct, 0),
                "competitor": competitor_types.get(ct, 0),
            }
            for ct in brand_keys | competitor_keys
        }
        return {
            "type_comparison": type_comparison,
            "competitor_unique_types": list(competitor_keys - brand_keys),
            "brand_unique_types": list(brand_keys - competitor_keys),
            "competitor_top_types": sorted(
                competitor_types.items(), key=lambda x: x[1], reverse=True,
            )[:5],
        }

    async def identify_opportunities(
        self,
        brand_data: dict,
        competitor_data: dict,
        comparison: dict,
    ) -> dict:
        """Derive opportunity entries from platform coverage and comparison.

        Opportunities come from platforms only one side is cited on, a
        positive citation-volume diff, and a positive-sentiment lead above
        0.1. A single low-potential "general" entry is returned when nothing
        else applies.
        """
        opportunities = []
        brand_platforms = set(brand_data["by_platform"].keys())
        competitor_platforms = set(competitor_data["by_platform"].keys())

        for platform in brand_platforms - competitor_platforms:
            opportunities.append({
                "area": f"platform_{platform}",
                "description": f"品牌在{platform}平台有引用而竞品没有,可加大投入建立差异化优势",
                "potential": "high",
                "action": f"增加在{platform}平台的内容投放和优化",
            })
        for platform in competitor_platforms - brand_platforms:
            opportunities.append({
                "area": f"platform_{platform}",
                "description": f"竞品在{platform}平台有引用而品牌没有,存在进入机会",
                "potential": "medium",
                "action": f"研究{platform}平台的内容偏好,制定进入策略",
            })

        citation_volume = comparison.get("citation_volume", {})
        if citation_volume.get("diff", 0) > 0:
            opportunities.append({
                "area": "citation_volume_advantage",
                "description": "品牌引用量高于竞品,可强化品牌权威性传播",
                "potential": "high",
                "action": "收集高引用内容案例,扩大品牌影响力",
            })

        sentiment = comparison.get("quality", {}).get("sentiment", {})
        if sentiment.get("diff", 0) > 0.1:
            opportunities.append({
                "area": "sentiment_advantage",
                "description": "品牌正面引用比例显著高于竞品,可强化正面形象传播",
                "potential": "high",
                "action": "收集正面引用案例,制作品牌优势内容",
            })

        if not opportunities:
            opportunities.append({
                "area": "general",
                "description": "当前数据未发现明显差异化机会,建议持续监测并积累更多数据",
                "potential": "low",
                "action": "增加查询频率和覆盖平台,积累更多引用数据",
            })

        return {
            "opportunities": opportunities,
            "total_opportunities": len(opportunities),
            "high_potential_count": sum(1 for o in opportunities if o["potential"] == "high"),
        }

    async def generate_recommendations(
        self,
        brand_name: str,
        competitor_name: str,
        comparison: dict,
        gaps: dict,
        opportunities: dict,
        data_sufficiency: str,
    ) -> dict:
        """Ask the default LLM provider for strategy recommendations.

        The parsed JSON object is returned with the provider's ``usage``
        attached under the ``"usage"`` key. If the call fails, or the reply
        cannot be parsed into a JSON object, the rule-based
        :meth:`_default_strategy` result is returned instead.
        """
        prompt = f"""你是一个专业的GEOGenerative Engine Optimization策略分析师。
请基于以下品牌与竞品的引用对比数据,生成策略建议。
品牌: {brand_name}
竞品: {competitor_name}
数据充分性: {data_sufficiency}
对比数据:
{json.dumps(comparison, ensure_ascii=False, indent=2)}
差距分析:
{json.dumps(gaps, ensure_ascii=False, indent=2)}
机会发现:
{json.dumps(opportunities, ensure_ascii=False, indent=2)}
请返回JSON格式不要包含其他文字:
{{
"gap_closing_strategies": [
{{"strategy": "策略描述", "priority": "high/medium/low", "expected_impact": "预期效果"}}
],
"differentiation_strategies": [
{{"strategy": "策略描述", "priority": "high/medium/low", "expected_impact": "预期效果"}}
],
"quick_wins": [
{{"action": "行动描述", "effort": "low/medium/high", "timeline": "预计时间"}}
],
"long_term_recommendations": [
{{"recommendation": "建议描述", "rationale": "理由"}}
]
}}"""
        try:
            provider = LLMFactory.get_default()
            response = await provider.chat(
                [{"role": "user", "content": prompt}],
                temperature=0.3,
                max_tokens=2000,
            )
            payload = extract_json(response.content)
            # json.loads raises TypeError (not ValueError) on non-text input,
            # which would bypass the fallback below; reject it explicitly.
            if not isinstance(payload, (str, bytes, bytearray)):
                raise ValueError("no JSON text could be extracted from the LLM reply")
            result = json.loads(payload)
            # A JSON array/scalar cannot carry the "usage" key or the
            # expected strategy sections.
            if not isinstance(result, dict):
                raise ValueError(
                    f"expected a JSON object, got {type(result).__name__}"
                )
            result["usage"] = response.usage
            return result
        except (LLMError, json.JSONDecodeError, ValueError) as e:
            logger.warning("LLM策略生成失败使用默认策略: %s", e)
            return self._default_strategy(gaps, opportunities)

    async def calculate_gap_score(
        self,
        db: AsyncSession,
        brand_id: uuid.UUID,
        brand_name: str,
    ) -> list[dict]:
        """Summarise stored gap analyses into one score per competitor.

        Every gap found in every stored insight of the brand contributes
        30 / 15 / 5 points for high / medium / low severity; the sum is
        capped at 100.
        """
        # NOTE(review): all insights of the brand are included, across every
        # analysis type and every historical run, so the same gap is likely
        # counted several times. Confirm whether only the latest run (or one
        # analysis type) should contribute.
        stmt = (
            select(CompetitorInsight)
            .where(CompetitorInsight.brand_id == brand_id)
            .order_by(CompetitorInsight.created_at.desc())
        )
        result = await db.execute(stmt)
        insights = list(result.scalars().all())

        competitor_map: dict[str, list[CompetitorInsight]] = defaultdict(list)
        for insight in insights:
            competitor_map[insight.competitor_name].append(insight)

        severity_points = {"high": 30, "medium": 15, "low": 5}
        summaries = []
        for comp_name, comp_insights in competitor_map.items():
            gap_dimensions = []
            score_components = []
            for insight in comp_insights:
                gap = insight.gap_analysis or {}
                for g in gap.get("gaps", []):
                    severity = g.get("severity", "low")
                    # Unknown severities are scored like "low".
                    score_components.append(severity_points.get(severity, 5))
                    gap_dimensions.append({
                        "dimension": g.get("dimension", "unknown"),
                        "severity": severity,
                        "gap": g.get("gap", 0),
                        "analysis_type": insight.analysis_type,
                    })
            overall_score = min(sum(score_components), 100) if score_components else 0.0
            summaries.append({
                "brand_name": brand_name,
                "competitor_name": comp_name,
                "gap_dimensions": gap_dimensions,
                "overall_gap_score": round(overall_score, 2),
            })
        return summaries

    async def _build_insight(
        self,
        session: AsyncSession,
        brand_id: uuid.UUID,
        brand_name: str,
        competitor: Competitor,
        analysis_type: str,
        brand_data: dict,
        competitor_data: dict,
        period_days: int,
    ) -> CompetitorInsight:
        """Assemble an unsaved ``CompetitorInsight`` for one analysis type.

        Volume/quality comparison, gaps, opportunities and recommendations
        are computed for every type; only ``insight_data`` depends on
        ``analysis_type``. ``"citation_gap"`` has no type-specific payload,
        so its ``insight_data`` is stored as ``None``.
        """
        comparison = {
            "citation_volume": await self.compare_citation_volume(
                brand_data, competitor_data,
            ),
        }
        comparison["quality"] = await self.compare_citation_quality(
            brand_data, competitor_data,
        )
        gap = self._identify_gaps(comparison, brand_name, competitor.name)
        opportunities = await self.identify_opportunities(
            brand_data, competitor_data, comparison,
        )
        data_sufficiency = self._assess_data_sufficiency(brand_data, competitor_data)

        insight_data = {}
        if analysis_type == "content_strategy":
            insight_data = await self.analyze_content_strategy(
                brand_data, competitor_data,
            )
        elif analysis_type == "platform_coverage":
            insight_data = comparison["citation_volume"]["by_platform"]
        elif analysis_type == "query_overlap":
            insight_data = await self._analyze_query_overlap(
                session, brand_id, brand_name, competitor.name, period_days,
            )
        elif analysis_type == "differentiation":
            brand_platforms = set(brand_data["by_platform"].keys())
            competitor_platforms = set(competitor_data["by_platform"].keys())
            insight_data = {
                "brand_unique_platforms": list(brand_platforms - competitor_platforms),
                "competitor_unique_platforms": list(competitor_platforms - brand_platforms),
                "sentiment_diff": comparison["quality"].get("sentiment", {}),
            }

        # NOTE(review): the inputs below do not depend on analysis_type, so
        # the LLM is queried once per analysis type with identical prompts
        # for the same competitor. Consider generating this once per
        # competitor if identical recommendations across types are fine.
        recommendations = await self.generate_recommendations(
            brand_name=brand_name,
            competitor_name=competitor.name,
            comparison=comparison,
            gaps=gap,
            opportunities=opportunities,
            data_sufficiency=data_sufficiency,
        )
        confidence = self._determine_confidence(brand_data, competitor_data)
        return CompetitorInsight(
            brand_id=brand_id,
            competitor_name=competitor.name,
            analysis_type=analysis_type,
            insight_data=insight_data if insight_data else None,
            citation_count_brand=brand_data["citation_count"],
            citation_count_competitor=competitor_data["citation_count"],
            sentiment_brand=brand_data.get("positive_ratio"),
            sentiment_competitor=competitor_data.get("positive_ratio"),
            platform_breakdown=comparison["citation_volume"]["by_platform"],
            gap_analysis=gap,
            opportunity_areas=opportunities,
            recommendations=recommendations,
            confidence=confidence,
            period_days=period_days,
        )

    async def _get_competitors(
        self,
        db: AsyncSession,
        brand_id: uuid.UUID,
    ) -> list[Competitor]:
        """Return all ``Competitor`` rows registered for ``brand_id``."""
        stmt = select(Competitor).where(Competitor.brand_id == brand_id)
        result = await db.execute(stmt)
        return list(result.scalars().all())

    @staticmethod
    def _empty_citation_data(total_records: int = 0) -> dict:
        """Return the aggregate dict used when no citation matches."""
        return {
            "citation_count": 0,
            "positive_ratio": 0.0,
            "avg_rank": 0.0,
            "by_platform": {},
            "content_types": {},
            "sentiment_breakdown": {"positive": 0, "neutral": 0, "negative": 0},
            "total_records": total_records,
        }

    async def _aggregate_citation_data(
        self,
        db: AsyncSession,
        brand_id: uuid.UUID,
        target_name: str,
        period_days: int = 30,
    ) -> dict:
        """Aggregate citation statistics for ``target_name``.

        Loads the brand's queries and their citation records from the last
        ``period_days`` days, selects the records attributed to the target
        and returns counts, positive-sentiment ratio, average rank, a
        per-platform breakdown and a ``match_type`` histogram. ``avg_rank``
        is 0.0 when no matching record has a positive citation position.
        """
        since = datetime.utcnow() - timedelta(days=period_days)
        query_result = await db.execute(
            select(Query).where(Query.brand_id == brand_id)
        )
        queries = list(query_result.scalars().all())
        if not queries:
            return self._empty_citation_data()

        query_ids = [q.id for q in queries]
        # Lower-cased brand names/aliases declared on the queries themselves.
        query_aliases: set[str] = set()
        for q in queries:
            if isinstance(q.target_brand, str):
                query_aliases.add(q.target_brand.lower())
            for alias in q.brand_aliases or []:
                if isinstance(alias, str):
                    query_aliases.add(alias.lower())

        stmt = select(CitationRecord).where(
            and_(
                CitationRecord.query_id.in_(query_ids),
                CitationRecord.queried_at >= since,
            )
        )
        result = await db.execute(stmt)
        records = list(result.scalars().all())

        # NOTE(review): a cited record is attributed to the target when it
        # lists no competitor brands at all, or when any listed brand equals
        # the target name OR one of the query's own brand aliases. For a
        # competitor target this appears to also count the brand's records;
        # confirm the intended attribution rules against the data model.
        accepted_names = query_aliases | {target_name.lower()}
        matching_records = []
        for record in records:
            if not record.cited:
                continue
            if not record.competitor_brands or any(
                isinstance(cb, str) and cb.lower() in accepted_names
                for cb in record.competitor_brands
            ):
                matching_records.append(record)

        total_citations = len(matching_records)
        if total_citations == 0:
            return self._empty_citation_data(total_records=len(records))

        sentiment_breakdown = {"positive": 0, "neutral": 0, "negative": 0}
        by_platform = defaultdict(lambda: {"citations": 0, "positive": 0, "ranks": []})
        content_types = defaultdict(int)
        ranks = []
        for r in matching_records:
            # Missing or unrecognised sentiment labels count as neutral.
            label = r.sentiment or "neutral"
            if label not in sentiment_breakdown:
                label = "neutral"
            sentiment_breakdown[label] += 1

            bucket = by_platform[r.platform]
            bucket["citations"] += 1
            if r.sentiment == "positive":
                bucket["positive"] += 1
            # Only positive positions count as a rank.
            if r.citation_position is not None and r.citation_position > 0:
                ranks.append(r.citation_position)
                bucket["ranks"].append(r.citation_position)

            content_types[r.match_type or "unknown"] += 1

        positive_ratio = sentiment_breakdown["positive"] / total_citations
        avg_rank = sum(ranks) / len(ranks) if ranks else 0.0
        platform_stats = {
            platform: {
                "citations": data["citations"],
                "positive_ratio": data["positive"] / data["citations"] if data["citations"] > 0 else 0.0,
                "avg_rank": sum(data["ranks"]) / len(data["ranks"]) if data["ranks"] else 0.0,
            }
            for platform, data in by_platform.items()
        }
        return {
            "citation_count": total_citations,
            "positive_ratio": round(positive_ratio, 4),
            "avg_rank": round(avg_rank, 2),
            "by_platform": platform_stats,
            "content_types": dict(content_types),
            "sentiment_breakdown": sentiment_breakdown,
            "total_records": len(records),
        }

    def _compare_platform_citations(
        self,
        brand_data: dict,
        competitor_data: dict,
    ) -> dict:
        """Pair brand and competitor stats for every platform either has.

        A side with no data on a platform gets zeroed stats.
        """
        brand_platforms = brand_data["by_platform"]
        competitor_platforms = competitor_data["by_platform"]
        result = {}
        for platform in set(brand_platforms.keys()) | set(competitor_platforms.keys()):
            result[platform] = {
                "brand": brand_platforms.get(
                    platform, {"citations": 0, "positive_ratio": 0.0, "avg_rank": 0.0},
                ),
                "competitor": competitor_platforms.get(
                    platform, {"citations": 0, "positive_ratio": 0.0, "avg_rank": 0.0},
                ),
            }
        return result

    def _identify_gaps(
        self,
        comparison: dict,
        brand_name: str,
        competitor_name: str,
    ) -> dict:
        """List the dimensions on which the brand trails the competitor.

        Checked dimensions: total citation count, positive-sentiment ratio
        (more than 0.1 behind), average rank (more than 1.0 worse, only when
        both sides have rank data) and per-platform citation counts (more
        than 2 behind).
        """
        gaps = []
        volume = comparison.get("citation_volume", {})
        citation_diff = volume.get("diff", 0)
        if citation_diff < 0:
            gaps.append({
                "dimension": "citation_count",
                "brand_value": volume.get("brand", 0),
                "competitor_value": volume.get("competitor", 0),
                "gap": abs(citation_diff),
                "severity": "high" if abs(citation_diff) >= 5 else "medium" if abs(citation_diff) >= 2 else "low",
            })

        quality = comparison.get("quality", {})
        sentiment = quality.get("sentiment", {})
        positive_diff = sentiment.get("diff", 0)
        if positive_diff < -0.1:
            gaps.append({
                "dimension": "positive_ratio",
                "brand_value": sentiment.get("brand_positive_ratio", 0),
                "competitor_value": sentiment.get("competitor_positive_ratio", 0),
                "gap": abs(positive_diff),
                "severity": "high" if abs(positive_diff) >= 0.3 else "medium" if abs(positive_diff) >= 0.15 else "low",
            })

        ranking = quality.get("ranking", {})
        rank_diff = ranking.get("diff", 0)
        brand_rank = ranking.get("brand_avg_rank", 0)
        competitor_rank = ranking.get("competitor_avg_rank", 0)
        # An average rank of 0 is the "no ranked citations" placeholder, not
        # a real position; comparing against it would report a bogus gap.
        if brand_rank > 0 and competitor_rank > 0 and rank_diff > 1.0:
            gaps.append({
                "dimension": "avg_rank",
                "brand_value": brand_rank,
                "competitor_value": competitor_rank,
                "gap": abs(rank_diff),
                "severity": "high" if abs(rank_diff) >= 3.0 else "medium" if abs(rank_diff) >= 2.0 else "low",
            })

        for platform, data in volume.get("by_platform", {}).items():
            brand_citations = data.get("brand", {}).get("citations", 0)
            competitor_citations = data.get("competitor", {}).get("citations", 0)
            if competitor_citations > brand_citations + 2:
                shortfall = competitor_citations - brand_citations
                gaps.append({
                    "dimension": f"platform_{platform}",
                    "brand_value": brand_citations,
                    "competitor_value": competitor_citations,
                    "gap": shortfall,
                    "severity": "high" if shortfall >= 5 else "medium",
                })

        return {
            "brand_name": brand_name,
            "competitor_name": competitor_name,
            "gaps": gaps,
            "total_gaps": len(gaps),
            "high_severity_count": sum(1 for g in gaps if g["severity"] == "high"),
        }

    async def _analyze_query_overlap(
        self,
        db: AsyncSession,
        brand_id: uuid.UUID,
        brand_name: str,
        competitor_name: str,
        period_days: int,
    ) -> dict:
        """Measure how many of the brand's recent query keywords overlap.

        A keyword counts towards the competitor when it contains the
        competitor name or any of the query's ``brand_aliases``.
        """
        since = datetime.utcnow() - timedelta(days=period_days)
        stmt = select(Query).where(
            Query.brand_id == brand_id,
            Query.created_at >= since,
        )
        result = await db.execute(stmt)
        queries = list(result.scalars().all())

        # NOTE(review): every keyword is added to brand_keywords, so
        # competitor_keywords is always a subset and competitor_only is
        # always empty. The alias check also uses the query's brand aliases
        # rather than competitor aliases. Confirm the intended definition.
        competitor_lower = competitor_name.lower()
        brand_keywords = set()
        competitor_keywords = set()
        for q in queries:
            if not q.keyword:
                # Queries without a keyword cannot take part in the overlap.
                continue
            keyword = q.keyword.lower()
            brand_keywords.add(keyword)
            if competitor_lower in keyword or any(
                a.lower() in keyword for a in (q.brand_aliases or [])
            ):
                competitor_keywords.add(keyword)

        overlap = brand_keywords & competitor_keywords
        brand_only = brand_keywords - competitor_keywords
        competitor_only = competitor_keywords - brand_keywords
        return {
            "brand_keyword_count": len(brand_keywords),
            "competitor_keyword_count": len(competitor_keywords),
            "overlap_count": len(overlap),
            "overlap_keywords": list(overlap)[:20],
            "brand_only_count": len(brand_only),
            "competitor_only_count": len(competitor_only),
            "overlap_ratio": round(len(overlap) / len(brand_keywords), 4) if brand_keywords else 0.0,
        }

    def _assess_data_sufficiency(
        self,
        brand_data: dict,
        competitor_data: dict,
    ) -> str:
        """Grade the sample size by the smaller of the two citation counts.

        More than 10 is "sufficient", 5 to 10 is "limited", fewer than 5 is
        "insufficient".
        """
        min_count = min(brand_data["citation_count"], competitor_data["citation_count"])
        if min_count > 10:
            return "sufficient"
        if min_count >= 5:
            return "limited"
        return "insufficient"

    def _determine_confidence(
        self,
        brand_data: dict,
        competitor_data: dict,
    ) -> str:
        """Grade confidence by the smaller of the two citation counts.

        More than 20 is "high", 5 to 20 is "medium", fewer than 5 is "low".
        """
        min_count = min(brand_data["citation_count"], competitor_data["citation_count"])
        if min_count > 20:
            return "high"
        if min_count >= 5:
            return "medium"
        return "low"

    def _default_strategy(self, gaps: dict, opportunities: dict) -> dict:
        """Build rule-based recommendations used when the LLM is unavailable.

        Each gap becomes a gap-closing strategy and each high/medium
        opportunity a differentiation strategy; both lists are capped at 5.
        """
        gap_strategies = [
            {
                "strategy": f"提升{gap['dimension']}维度表现,缩小与竞品差距",
                "priority": gap["severity"],
                "expected_impact": f"预计可将{gap['dimension']}差距缩小{gap['gap'] * 0.5:.1f}",
            }
            for gap in gaps.get("gaps", [])
        ]
        diff_strategies = [
            {
                "strategy": opp["action"],
                "priority": opp["potential"],
                "expected_impact": "建立差异化竞争优势",
            }
            for opp in opportunities.get("opportunities", [])
            if opp["potential"] in ("high", "medium")
        ]
        return {
            "gap_closing_strategies": gap_strategies[:5],
            "differentiation_strategies": diff_strategies[:5],
            "quick_wins": [],
            "long_term_recommendations": [
                {
                    "recommendation": "持续监测竞品引用数据变化,定期更新策略",
                    "rationale": "GEO优化是长期过程需要持续迭代",
                }
            ],
        }