"""Competitor analysis service.

Aggregates citation records for a brand and its competitors, compares them,
derives gaps/opportunities, asks an LLM for strategy recommendations and
persists the outcome as ``CompetitorInsight`` rows.
"""

import inspect
import json
import logging
import uuid
from collections import defaultdict
from datetime import datetime, timedelta
from typing import Awaitable, Callable

from sqlalchemy import select, func, and_
from sqlalchemy.ext.asyncio import AsyncSession

from app.database import AsyncSessionLocal
from app.models.brand import Brand
from app.models.competitor import Competitor
from app.models.competitor_insight import CompetitorInsight
from app.models.citation_record import CitationRecord
from app.models.query import Query
from app.services.llm import LLMFactory, LLMError
from app.utils.json_extractor import extract_json

logger = logging.getLogger(__name__)

# Analysis types accepted by CompetitorAnalyzerService.analyze_competitor.
VALID_ANALYSIS_TYPES = [
    "citation_gap",
    "content_strategy",
    "platform_coverage",
    "query_overlap",
    "differentiation",
]


class CompetitorAnalyzerService:
    """Compare a brand's citation data with its competitors and store insights."""

    # Points one gap of a given severity adds to the overall gap score.
    _SEVERITY_SCORES = {"high": 30, "medium": 15, "low": 5}

    async def analyze_competitor(
        self,
        brand_id: uuid.UUID,
        analysis_types: list[str] | None = None,
        period_days: int = 30,
        progress_callback: Callable[[float, str], Awaitable[None] | None] | None = None,
    ) -> dict:
        """Run the requested analyses for every competitor of a brand.

        One ``CompetitorInsight`` is built, added to the session and committed
        for each (competitor, analysis type) pair.

        Args:
            brand_id: Primary key of the brand to analyse.
            analysis_types: Subset of ``VALID_ANALYSIS_TYPES``; all of them
                when ``None``.
            period_days: Look-back window, in days, for citation records.
            progress_callback: Optional ``(progress, message)`` reporter. May
                be a plain function or a coroutine function.

        Returns:
            A dict with ``brand_id``, ``brand_name``, the serialised
            ``insights`` and their ``total`` count.

        Raises:
            ValueError: On an unsupported analysis type, an unknown brand, or
                a brand without competitors.
        """
        if analysis_types is None:
            # Copy so callers further down can never mutate the module constant.
            analysis_types = list(VALID_ANALYSIS_TYPES)

        invalid = set(analysis_types) - set(VALID_ANALYSIS_TYPES)
        if invalid:
            # Sorted + stringified: deterministic message, and non-string
            # entries still produce the intended ValueError.
            invalid_names = ", ".join(sorted(map(str, invalid)))
            raise ValueError(f"不支持的分析类型: {invalid_names}")

        async with AsyncSessionLocal() as session:
            brand = await session.get(Brand, brand_id)
            if not brand:
                raise ValueError(f"品牌不存在: {brand_id}")

            await self._report_progress(progress_callback, 0.1, "获取竞品列表...")
            competitors = await self._get_competitors(session, brand_id)
            if not competitors:
                raise ValueError("未找到竞品数据")

            await self._report_progress(progress_callback, 0.2, "聚合品牌引用数据...")
            brand_citation_data = await self._aggregate_citation_data(
                session, brand_id, brand.name, period_days,
            )

            results = []
            total = len(competitors)
            for i, competitor in enumerate(competitors):
                # Competitor analysis occupies the 0.25 .. 0.75 progress band.
                await self._report_progress(
                    progress_callback,
                    0.25 + (0.5 * i / total),
                    f"分析竞品 {competitor.name}...",
                )

                competitor_citation_data = await self._aggregate_citation_data(
                    session, brand_id, competitor.name, period_days,
                )

                for analysis_type in analysis_types:
                    insight = await self._build_insight(
                        session=session,
                        brand_id=brand_id,
                        brand_name=brand.name,
                        competitor=competitor,
                        analysis_type=analysis_type,
                        brand_data=brand_citation_data,
                        competitor_data=competitor_citation_data,
                        period_days=period_days,
                    )
                    session.add(insight)
                    results.append(insight)

            await session.commit()
            # Reload each row after the commit before reading its attributes.
            for r in results:
                await session.refresh(r)

            return {
                "brand_id": str(brand_id),
                "brand_name": brand.name,
                "insights": [self._serialize_insight(r) for r in results],
                "total": len(results),
            }

    @staticmethod
    async def _report_progress(callback, progress: float, message: str) -> None:
        """Invoke ``callback`` if given, awaiting its result when it is awaitable."""
        if not callback:
            return
        outcome = callback(progress, message)
        if inspect.isawaitable(outcome):
            await outcome

    @staticmethod
    def _serialize_insight(insight: CompetitorInsight) -> dict:
        """Turn a persisted insight into the plain dict returned to callers."""
        return {
            "id": str(insight.id),
            "competitor_name": insight.competitor_name,
            "analysis_type": insight.analysis_type,
            "citation_count_brand": insight.citation_count_brand,
            "citation_count_competitor": insight.citation_count_competitor,
            "sentiment_brand": insight.sentiment_brand,
            "sentiment_competitor": insight.sentiment_competitor,
            "platform_breakdown": insight.platform_breakdown,
            "gap_analysis": insight.gap_analysis,
            "opportunity_areas": insight.opportunity_areas,
            "recommendations": insight.recommendations,
            "confidence": insight.confidence,
            "period_days": insight.period_days,
            "created_at": insight.created_at.isoformat() if insight.created_at else None,
        }

    async def compare_citation_volume(
        self,
        brand_data: dict,
        competitor_data: dict,
    ) -> dict:
        """Compare raw citation counts, overall and per platform.

        Both arguments are aggregates as produced by
        ``_aggregate_citation_data`` (``citation_count`` and ``by_platform``
        keys are required).

        Returns:
            Counts, their difference (brand minus competitor), each side's
            share of the combined total (0.0 when there are no citations) and
            a per-platform comparison.
        """
        brand_count = brand_data["citation_count"]
        competitor_count = competitor_data["citation_count"]
        total = brand_count + competitor_count

        return {
            "brand": brand_count,
            "competitor": competitor_count,
            "diff": brand_count - competitor_count,
            "brand_share": round(brand_count / total, 4) if total > 0 else 0.0,
            "competitor_share": round(competitor_count / total, 4) if total > 0 else 0.0,
            "by_platform": self._compare_platform_citations(
                brand_data, competitor_data,
            ),
        }

    async def compare_citation_quality(
        self,
        brand_data: dict,
        competitor_data: dict,
    ) -> dict:
        """Compare positive-sentiment ratio and average rank of both sides.

        Missing ``positive_ratio`` / ``avg_rank`` keys default to 0.0. Every
        ``diff`` is brand minus competitor.
        """
        brand_positive = brand_data.get("positive_ratio", 0.0)
        competitor_positive = competitor_data.get("positive_ratio", 0.0)
        brand_rank = brand_data.get("avg_rank", 0.0)
        competitor_rank = competitor_data.get("avg_rank", 0.0)

        return {
            "sentiment": {
                "brand_positive_ratio": brand_positive,
                "competitor_positive_ratio": competitor_positive,
                "diff": round(brand_positive - competitor_positive, 4),
            },
            "ranking": {
                "brand_avg_rank": brand_rank,
                "competitor_avg_rank": competitor_rank,
                "diff": round(brand_rank - competitor_rank, 2),
            },
            "brand_sentiment_breakdown": brand_data.get("sentiment_breakdown", {}),
            "competitor_sentiment_breakdown": competitor_data.get("sentiment_breakdown", {}),
        }

    async def analyze_content_strategy(
        self,
        brand_data: dict,
        competitor_data: dict,
    ) -> dict:
        """Compare the ``content_types`` counters of brand and competitor.

        Returns:
            Per-type counts for both sides, the types only one side has, and
            the competitor's five most frequent types as ``(type, count)``
            pairs.
        """
        brand_types = brand_data.get("content_types", {})
        competitor_types = competitor_data.get("content_types", {})
        brand_keys = set(brand_types.keys())
        competitor_keys = set(competitor_types.keys())

        type_comparison = {
            ct: {
                "brand": brand_types.get(ct, 0),
                "competitor": competitor_types.get(ct, 0),
            }
            for ct in brand_keys | competitor_keys
        }

        return {
            "type_comparison": type_comparison,
            "competitor_unique_types": list(competitor_keys - brand_keys),
            "brand_unique_types": list(brand_keys - competitor_keys),
            "competitor_top_types": sorted(
                competitor_types.items(), key=lambda x: x[1], reverse=True,
            )[:5],
        }

    async def identify_opportunities(
        self,
        brand_data: dict,
        competitor_data: dict,
        comparison: dict,
    ) -> dict:
        """Derive opportunity entries from platform coverage and comparison data.

        Args:
            brand_data: Brand aggregate; ``by_platform`` is required.
            competitor_data: Competitor aggregate; ``by_platform`` is required.
            comparison: Dict optionally holding ``citation_volume`` and
                ``quality`` sub-dicts as built in ``_build_insight``.

        Returns:
            The opportunity list (never empty: a generic low-potential entry is
            used when nothing else applies), its length and the number of
            high-potential entries.
        """
        opportunities = []

        brand_platforms = set(brand_data["by_platform"].keys())
        competitor_platforms = set(competitor_data["by_platform"].keys())

        # Platforms where only the brand is cited.
        for platform in brand_platforms - competitor_platforms:
            opportunities.append({
                "area": f"platform_{platform}",
                "description": f"品牌在{platform}平台有引用而竞品没有,可加大投入建立差异化优势",
                "potential": "high",
                "action": f"增加在{platform}平台的内容投放和优化",
            })

        # Platforms where only the competitor is cited.
        for platform in competitor_platforms - brand_platforms:
            opportunities.append({
                "area": f"platform_{platform}",
                "description": f"竞品在{platform}平台有引用而品牌没有,存在进入机会",
                "potential": "medium",
                "action": f"研究{platform}平台的内容偏好,制定进入策略",
            })

        citation_volume = comparison.get("citation_volume", {})
        if citation_volume.get("diff", 0) > 0:
            opportunities.append({
                "area": "citation_volume_advantage",
                "description": "品牌引用量高于竞品,可强化品牌权威性传播",
                "potential": "high",
                "action": "收集高引用内容案例,扩大品牌影响力",
            })

        sentiment = comparison.get("quality", {}).get("sentiment", {})
        # Only a lead of more than 10 percentage points counts as an advantage.
        if sentiment.get("diff", 0) > 0.1:
            opportunities.append({
                "area": "sentiment_advantage",
                "description": "品牌正面引用比例显著高于竞品,可强化正面形象传播",
                "potential": "high",
                "action": "收集正面引用案例,制作品牌优势内容",
            })

        if not opportunities:
            opportunities.append({
                "area": "general",
                "description": "当前数据未发现明显差异化机会,建议持续监测并积累更多数据",
                "potential": "low",
                "action": "增加查询频率和覆盖平台,积累更多引用数据",
            })

        return {
            "opportunities": opportunities,
            "total_opportunities": len(opportunities),
            "high_potential_count": sum(1 for o in opportunities if o["potential"] == "high"),
        }

    async def generate_recommendations(
        self,
        brand_name: str,
        competitor_name: str,
        comparison: dict,
        gaps: dict,
        opportunities: dict,
        data_sufficiency: str,
    ) -> dict:
        """Ask the default LLM provider for strategy recommendations.

        The reply is expected to be a JSON object; the provider's ``usage`` is
        attached to it under the ``usage`` key.

        Returns:
            The parsed LLM reply, or the output of ``_default_strategy`` when
            the call fails or the reply is not a usable JSON object.
        """
        comparison_json = json.dumps(comparison, ensure_ascii=False, indent=2)
        gaps_json = json.dumps(gaps, ensure_ascii=False, indent=2)
        opportunities_json = json.dumps(opportunities, ensure_ascii=False, indent=2)

        # The prompt is runtime text sent to the model and is kept verbatim;
        # its segments are separated by single spaces.
        prompt = (
            "你是一个专业的GEO(Generative Engine Optimization)策略分析师。 "
            "请基于以下品牌与竞品的引用对比数据,生成策略建议。 "
            f"品牌: {brand_name} "
            f"竞品: {competitor_name} "
            f"数据充分性: {data_sufficiency} "
            f"对比数据: {comparison_json} "
            f"差距分析: {gaps_json} "
            f"机会发现: {opportunities_json} "
            "请返回JSON格式(不要包含其他文字): "
            '{ "gap_closing_strategies": [ '
            '{"strategy": "策略描述", "priority": "high/medium/low", "expected_impact": "预期效果"} '
            '], "differentiation_strategies": [ '
            '{"strategy": "策略描述", "priority": "high/medium/low", "expected_impact": "预期效果"} '
            '], "quick_wins": [ '
            '{"action": "行动描述", "effort": "low/medium/high", "timeline": "预计时间"} '
            '], "long_term_recommendations": [ '
            '{"recommendation": "建议描述", "rationale": "理由"} '
            "] }"
        )

        try:
            provider = LLMFactory.get_default()
            response = await provider.chat(
                [{"role": "user", "content": prompt}],
                temperature=0.3,
                max_tokens=2000,
            )
            result = json.loads(extract_json(response.content))
            if not isinstance(result, dict):
                # A JSON array/scalar cannot carry the "usage" key below.
                raise ValueError(
                    f"expected a JSON object, got {type(result).__name__}"
                )
            result["usage"] = response.usage
            return result
        except (LLMError, json.JSONDecodeError, ValueError, TypeError) as e:
            # TypeError covers json.loads() receiving a non-string payload.
            logger.warning("LLM策略生成失败,使用默认策略: %s", e)
            return self._default_strategy(gaps, opportunities)

    async def calculate_gap_score(
        self,
        db: AsyncSession,
        brand_id: uuid.UUID,
        brand_name: str,
    ) -> list[dict]:
        """Summarise stored gap analyses per competitor into a 0-100 score.

        Every stored insight of the brand is read (newest first) and grouped
        by competitor name. Each recorded gap adds 30/15/5 points for
        high/medium/low severity (unknown severities count as 5); the sum is
        capped at 100.

        Returns:
            One summary dict per competitor with the gap dimensions found and
            the ``overall_gap_score``.
        """
        stmt = (
            select(CompetitorInsight)
            .where(CompetitorInsight.brand_id == brand_id)
            .order_by(CompetitorInsight.created_at.desc())
        )
        result = await db.execute(stmt)
        insights = list(result.scalars().all())

        competitor_map: dict[str, list[CompetitorInsight]] = defaultdict(list)
        for insight in insights:
            competitor_map[insight.competitor_name].append(insight)

        summaries = []
        for comp_name, comp_insights in competitor_map.items():
            gap_dimensions = []
            score_components = []

            for insight in comp_insights:
                gap = insight.gap_analysis or {}
                if not gap:
                    continue
                for g in gap.get("gaps", []):
                    severity = g.get("severity", "low")
                    score_components.append(self._SEVERITY_SCORES.get(severity, 5))
                    gap_dimensions.append({
                        "dimension": g.get("dimension", "unknown"),
                        "severity": severity,
                        "gap": g.get("gap", 0),
                        "analysis_type": insight.analysis_type,
                    })

            overall_score = min(sum(score_components), 100) if score_components else 0.0
            summaries.append({
                "brand_name": brand_name,
                "competitor_name": comp_name,
                "gap_dimensions": gap_dimensions,
                "overall_gap_score": round(overall_score, 2),
            })

        return summaries

    async def _build_insight(
        self,
        session: AsyncSession,
        brand_id: uuid.UUID,
        brand_name: str,
        competitor: Competitor,
        analysis_type: str,
        brand_data: dict,
        competitor_data: dict,
        period_days: int,
    ) -> CompetitorInsight:
        """Build (without persisting) one insight for a competitor/analysis type.

        Volume and quality comparisons, gaps, opportunities, recommendations
        and confidence are computed for every analysis type; only
        ``insight_data`` depends on ``analysis_type`` and it stays ``None`` for
        types without a dedicated branch (e.g. ``citation_gap``).
        """
        comparison = {}
        comparison["citation_volume"] = await self.compare_citation_volume(
            brand_data, competitor_data,
        )
        comparison["quality"] = await self.compare_citation_quality(
            brand_data, competitor_data,
        )

        gap = self._identify_gaps(comparison, brand_name, competitor.name)
        opportunities = await self.identify_opportunities(
            brand_data, competitor_data, comparison,
        )
        data_sufficiency = self._assess_data_sufficiency(brand_data, competitor_data)

        insight_data = {}
        if analysis_type == "content_strategy":
            insight_data = await self.analyze_content_strategy(
                brand_data, competitor_data,
            )
        elif analysis_type == "platform_coverage":
            insight_data = comparison["citation_volume"]["by_platform"]
        elif analysis_type == "query_overlap":
            insight_data = await self._analyze_query_overlap(
                session, brand_id, brand_name, competitor.name, period_days,
            )
        elif analysis_type == "differentiation":
            brand_platforms = set(brand_data["by_platform"].keys())
            competitor_platforms = set(competitor_data["by_platform"].keys())
            insight_data = {
                "brand_unique_platforms": list(brand_platforms - competitor_platforms),
                "competitor_unique_platforms": list(competitor_platforms - brand_platforms),
                "sentiment_diff": comparison["quality"].get("sentiment", {}),
            }

        recommendations = await self.generate_recommendations(
            brand_name=brand_name,
            competitor_name=competitor.name,
            comparison=comparison,
            gaps=gap,
            opportunities=opportunities,
            data_sufficiency=data_sufficiency,
        )

        confidence = self._determine_confidence(brand_data, competitor_data)

        return CompetitorInsight(
            brand_id=brand_id,
            competitor_name=competitor.name,
            analysis_type=analysis_type,
            insight_data=insight_data if insight_data else None,
            citation_count_brand=brand_data["citation_count"],
            citation_count_competitor=competitor_data["citation_count"],
            sentiment_brand=brand_data.get("positive_ratio"),
            sentiment_competitor=competitor_data.get("positive_ratio"),
            platform_breakdown=comparison["citation_volume"]["by_platform"],
            gap_analysis=gap,
            opportunity_areas=opportunities,
            recommendations=recommendations,
            confidence=confidence,
            period_days=period_days,
        )

    async def _get_competitors(
        self,
        db: AsyncSession,
        brand_id: uuid.UUID,
    ) -> list[Competitor]:
        """Return every ``Competitor`` row registered for ``brand_id``."""
        stmt = select(Competitor).where(Competitor.brand_id == brand_id)
        result = await db.execute(stmt)
        return list(result.scalars().all())

    @staticmethod
    def _empty_citation_data(total_records: int) -> dict:
        """Return the aggregate used when no citation matches the target."""
        return {
            "citation_count": 0,
            "positive_ratio": 0.0,
            "avg_rank": 0.0,
            "by_platform": {},
            "content_types": {},
            "sentiment_breakdown": {"positive": 0, "neutral": 0, "negative": 0},
            "total_records": total_records,
        }

    async def _aggregate_citation_data(
        self,
        db: AsyncSession,
        brand_id: uuid.UUID,
        target_name: str,
        period_days: int = 30,
    ) -> dict:
        """Aggregate citation statistics for ``target_name`` within a brand's queries.

        Loads all queries of the brand, then the citation records of those
        queries whose ``queried_at`` lies within the last ``period_days`` days,
        and keeps the records considered to match the target.

        Returns:
            A dict with ``citation_count``, ``positive_ratio``, ``avg_rank``,
            ``by_platform``, ``content_types``, ``sentiment_breakdown`` and
            ``total_records`` (records inspected before matching).
        """
        since = datetime.utcnow() - timedelta(days=period_days)

        query_result = await db.execute(select(Query).where(Query.brand_id == brand_id))
        queries = list(query_result.scalars().all())
        if not queries:
            return self._empty_citation_data(0)

        query_ids = [q.id for q in queries]
        query_aliases = set()
        for q in queries:
            # target_brand may be unset; skip it instead of failing on None.
            if q.target_brand:
                query_aliases.add(q.target_brand.lower())
            if q.brand_aliases:
                for alias in q.brand_aliases:
                    query_aliases.add(alias.lower())

        stmt = select(CitationRecord).where(
            and_(
                CitationRecord.query_id.in_(query_ids),
                CitationRecord.queried_at >= since,
            )
        )
        result = await db.execute(stmt)
        records = list(result.scalars().all())

        # NOTE(review): a cited record matches when it lists no competitor
        # brands, or when one listed brand equals the target OR any query
        # target/alias. For competitor targets this also counts records that
        # only mention the brand's own aliases - confirm this is intended.
        accepted_names = query_aliases | {target_name.lower()}
        matching_records = []
        for record in records:
            if not record.cited:
                continue
            if not record.competitor_brands:
                matching_records.append(record)
                continue
            if any(
                isinstance(cb, str) and cb.lower() in accepted_names
                for cb in record.competitor_brands
            ):
                matching_records.append(record)

        total_citations = len(matching_records)
        if total_citations == 0:
            return self._empty_citation_data(len(records))

        sentiment_breakdown = {"positive": 0, "neutral": 0, "negative": 0}
        for r in matching_records:
            s = r.sentiment or "neutral"
            # Unknown sentiment labels are folded into "neutral".
            if s in sentiment_breakdown:
                sentiment_breakdown[s] += 1
            else:
                sentiment_breakdown["neutral"] += 1

        positive_ratio = sentiment_breakdown["positive"] / total_citations

        # Only strictly positive positions take part in rank averages.
        ranks = [
            r.citation_position for r in matching_records
            if r.citation_position is not None and r.citation_position > 0
        ]
        avg_rank = sum(ranks) / len(ranks) if ranks else 0.0

        by_platform = defaultdict(lambda: {"citations": 0, "positive": 0, "ranks": []})
        for r in matching_records:
            bucket = by_platform[r.platform]
            bucket["citations"] += 1
            if r.sentiment == "positive":
                bucket["positive"] += 1
            if r.citation_position is not None and r.citation_position > 0:
                bucket["ranks"].append(r.citation_position)

        platform_stats = {}
        for platform, data in by_platform.items():
            platform_stats[platform] = {
                "citations": data["citations"],
                "positive_ratio": data["positive"] / data["citations"] if data["citations"] > 0 else 0.0,
                "avg_rank": sum(data["ranks"]) / len(data["ranks"]) if data["ranks"] else 0.0,
            }

        # The "content type" buckets are keyed by each record's match_type.
        content_types = defaultdict(int)
        for r in matching_records:
            content_types[r.match_type or "unknown"] += 1

        return {
            "citation_count": total_citations,
            "positive_ratio": round(positive_ratio, 4),
            "avg_rank": round(avg_rank, 2),
            "by_platform": platform_stats,
            "content_types": dict(content_types),
            "sentiment_breakdown": sentiment_breakdown,
            "total_records": len(records),
        }

    def _compare_platform_citations(
        self,
        brand_data: dict,
        competitor_data: dict,
    ) -> dict:
        """Pair brand and competitor stats for every platform either side has.

        A side without data for a platform gets a zeroed stats dict.
        """
        brand_platforms = brand_data["by_platform"]
        competitor_platforms = competitor_data["by_platform"]

        result = {}
        for platform in set(brand_platforms.keys()) | set(competitor_platforms.keys()):
            # A fresh zero dict per lookup so results never share one object.
            result[platform] = {
                "brand": brand_platforms.get(
                    platform, {"citations": 0, "positive_ratio": 0.0, "avg_rank": 0.0},
                ),
                "competitor": competitor_platforms.get(
                    platform, {"citations": 0, "positive_ratio": 0.0, "avg_rank": 0.0},
                ),
            }
        return result

    def _identify_gaps(
        self,
        comparison: dict,
        brand_name: str,
        competitor_name: str,
    ) -> dict:
        """List the dimensions in which the brand trails the competitor.

        Checked dimensions: total citation count (any deficit), positive ratio
        (deficit above 0.1), average rank (brand rank value more than 1.0
        higher) and per-platform citations (competitor ahead by more than 2).

        Returns:
            Both names, the ``gaps`` list, its length and the number of
            high-severity gaps.
        """
        gaps = []

        volume = comparison.get("citation_volume", {})
        citation_diff = volume.get("diff", 0)
        if citation_diff < 0:
            deficit = abs(citation_diff)
            gaps.append({
                "dimension": "citation_count",
                "brand_value": volume.get("brand", 0),
                "competitor_value": volume.get("competitor", 0),
                "gap": deficit,
                "severity": "high" if deficit >= 5 else "medium" if deficit >= 2 else "low",
            })

        quality = comparison.get("quality", {})
        sentiment = quality.get("sentiment", {})
        positive_diff = sentiment.get("diff", 0)
        if positive_diff < -0.1:
            deficit = abs(positive_diff)
            gaps.append({
                "dimension": "positive_ratio",
                "brand_value": sentiment.get("brand_positive_ratio", 0),
                "competitor_value": sentiment.get("competitor_positive_ratio", 0),
                "gap": deficit,
                "severity": "high" if deficit >= 0.3 else "medium" if deficit >= 0.15 else "low",
            })

        ranking = quality.get("ranking", {})
        rank_diff = ranking.get("diff", 0)
        if rank_diff > 1.0:
            deficit = abs(rank_diff)
            gaps.append({
                "dimension": "avg_rank",
                "brand_value": ranking.get("brand_avg_rank", 0),
                "competitor_value": ranking.get("competitor_avg_rank", 0),
                "gap": deficit,
                "severity": "high" if deficit >= 3.0 else "medium" if deficit >= 2.0 else "low",
            })

        for platform, data in volume.get("by_platform", {}).items():
            brand_citations = data.get("brand", {}).get("citations", 0)
            competitor_citations = data.get("competitor", {}).get("citations", 0)
            if competitor_citations > brand_citations + 2:
                deficit = competitor_citations - brand_citations
                gaps.append({
                    "dimension": f"platform_{platform}",
                    "brand_value": brand_citations,
                    "competitor_value": competitor_citations,
                    "gap": deficit,
                    "severity": "high" if deficit >= 5 else "medium",
                })

        return {
            "brand_name": brand_name,
            "competitor_name": competitor_name,
            "gaps": gaps,
            "total_gaps": len(gaps),
            "high_severity_count": sum(1 for g in gaps if g["severity"] == "high"),
        }

    async def _analyze_query_overlap(
        self,
        db: AsyncSession,
        brand_id: uuid.UUID,
        brand_name: str,
        competitor_name: str,
        period_days: int,
    ) -> dict:
        """Measure how many recent brand query keywords also mention the competitor.

        Considers the brand's queries created within the last ``period_days``
        days. A keyword counts for the competitor when it contains the
        competitor name or one of that query's ``brand_aliases``.
        ``brand_name`` is accepted but not used.
        """
        since = datetime.utcnow() - timedelta(days=period_days)

        stmt = select(Query).where(
            Query.brand_id == brand_id,
            Query.created_at >= since,
        )
        result = await db.execute(stmt)
        queries = list(result.scalars().all())

        competitor_lower = competitor_name.lower()
        brand_keywords = set()
        competitor_keywords = set()
        for q in queries:
            keyword = q.keyword.lower()
            brand_keywords.add(keyword)
            # NOTE(review): matching the query's *brand* aliases marks the
            # keyword as a competitor keyword - confirm this is intended.
            if competitor_lower in keyword or any(
                a.lower() in keyword for a in (q.brand_aliases or [])
            ):
                competitor_keywords.add(keyword)

        overlap = brand_keywords & competitor_keywords
        brand_only = brand_keywords - competitor_keywords
        # Competitor keywords are always also brand keywords (see loop above),
        # so this set is empty by construction.
        competitor_only = competitor_keywords - brand_keywords

        return {
            "brand_keyword_count": len(brand_keywords),
            "competitor_keyword_count": len(competitor_keywords),
            "overlap_count": len(overlap),
            "overlap_keywords": list(overlap)[:20],
            "brand_only_count": len(brand_only),
            "competitor_only_count": len(competitor_only),
            "overlap_ratio": round(len(overlap) / len(brand_keywords), 4) if brand_keywords else 0.0,
        }

    def _assess_data_sufficiency(
        self,
        brand_data: dict,
        competitor_data: dict,
    ) -> str:
        """Grade data volume from the smaller of the two citation counts.

        More than 10 -> ``"sufficient"``; 5 to 10 -> ``"limited"``; otherwise
        ``"insufficient"``.
        """
        min_count = min(brand_data["citation_count"], competitor_data["citation_count"])
        if min_count > 10:
            return "sufficient"
        if min_count >= 5:
            return "limited"
        return "insufficient"

    def _determine_confidence(
        self,
        brand_data: dict,
        competitor_data: dict,
    ) -> str:
        """Grade confidence from the smaller of the two citation counts.

        More than 20 -> ``"high"``; 5 to 20 -> ``"medium"``; otherwise
        ``"low"``.
        """
        min_count = min(brand_data["citation_count"], competitor_data["citation_count"])
        if min_count > 20:
            return "high"
        if min_count >= 5:
            return "medium"
        return "low"

    def _default_strategy(self, gaps: dict, opportunities: dict) -> dict:
        """Build rule-based recommendations used when the LLM path fails.

        One gap-closing strategy is produced per gap and one differentiation
        strategy per high/medium opportunity; each list is cut to five items.
        """
        gap_strategies = [
            {
                "strategy": f"提升{gap['dimension']}维度表现,缩小与竞品差距",
                "priority": gap["severity"],
                "expected_impact": f"预计可将{gap['dimension']}差距缩小{gap['gap'] * 0.5:.1f}",
            }
            for gap in gaps.get("gaps", [])
        ]

        diff_strategies = [
            {
                "strategy": opp["action"],
                "priority": opp["potential"],
                "expected_impact": "建立差异化竞争优势",
            }
            for opp in opportunities.get("opportunities", [])
            if opp["potential"] in ("high", "medium")
        ]

        return {
            "gap_closing_strategies": gap_strategies[:5],
            "differentiation_strategies": diff_strategies[:5],
            "quick_wins": [],
            "long_term_recommendations": [
                {
                    "recommendation": "持续监测竞品引用数据变化,定期更新策略",
                    "rationale": "GEO优化是长期过程,需要持续迭代",
                }
            ],
        }