620 lines
23 KiB
Python
620 lines
23 KiB
Python
from datetime import UTC, datetime
|
|
|
|
import pytest
|
|
|
|
from app.services.ai_engine.base import AIQueryResult, CitationInfo, EngineType
|
|
from app.services.citation.citation_pattern import (
|
|
AuthoritySignalAnalyzer,
|
|
CitationFormatAnalyzer,
|
|
CitationPattern,
|
|
CitationPatternEngine,
|
|
ContentStructureAnalyzer,
|
|
EnginePreferenceAnalyzer,
|
|
PatternAnalysisReport,
|
|
)
|
|
|
|
|
|
def _make_result(
    engine_type: EngineType = EngineType.CHATGPT,
    raw_response: str = "test response",
    citations: list[CitationInfo] | None = None,
    has_brand_citation: bool = False,
    has_competitor_citation: bool = False,
    brand_context: str | None = None,
    competitor_contexts: list[str] | None = None,
    metadata: dict | None = None,
) -> AIQueryResult:
    """Build an ``AIQueryResult`` for tests with constant boilerplate fields.

    The query text, response time and timestamp are fixed; every other field
    is taken from the argument of the same name. Falsy ``citations``,
    ``competitor_contexts`` and ``metadata`` are replaced by a new empty
    container.
    """
    fields = {
        "engine_type": engine_type,
        "query": "test query",
        "raw_response": raw_response,
        "citations": citations or [],
        "has_brand_citation": has_brand_citation,
        "has_competitor_citation": has_competitor_citation,
        "brand_context": brand_context,
        "competitor_contexts": competitor_contexts or [],
        "response_time_ms": 1000,
        "timestamp": datetime(2025, 1, 1, tzinfo=UTC),
        "metadata": metadata or {},
    }
    return AIQueryResult(**fields)
|
|
|
|
|
|
class TestCitationPatternDataStructure:
    """Field-level checks for constructing ``CitationPattern`` directly."""

    def test_create_citation_pattern(self):
        """Constructor arguments can be read back from the instance."""
        pattern = CitationPattern(
            pattern_type="content_structure",
            pattern_name="faq_format",
            frequency=0.6,
            confidence=0.8,
            description="FAQ format detected in responses",
            details={"count": 3, "total": 5},
        )
        assert pattern.pattern_type == "content_structure"
        assert pattern.pattern_name == "faq_format"
        assert pattern.frequency == 0.6
        assert pattern.confidence == 0.8
        assert pattern.details["count"] == 3

    def test_frequency_range(self):
        """A frequency of 0.0 is held within the closed interval [0, 1]."""
        pattern = CitationPattern(
            pattern_type="authority_signal",
            pattern_name="data_citation",
            frequency=0.0,
            confidence=0.5,
            description="",
            details={},
        )
        assert 0.0 <= pattern.frequency <= 1.0

    def test_confidence_range(self):
        """A confidence of 1.0 is held within the closed interval [0, 1]."""
        pattern = CitationPattern(
            pattern_type="citation_format",
            pattern_name="direct_citation",
            frequency=0.5,
            confidence=1.0,
            description="",
            details={},
        )
        assert 0.0 <= pattern.confidence <= 1.0

    def test_pattern_types(self):
        """Each of the four pattern type strings is stored unchanged."""
        # A tuple keeps the iteration order deterministic between runs.
        valid_types = (
            "content_structure",
            "authority_signal",
            "citation_format",
            "engine_preference",
        )
        for pt in valid_types:
            pattern = CitationPattern(
                pattern_type=pt,
                pattern_name="test",
                frequency=0.5,
                confidence=0.5,
                description="",
                details={},
            )
            # Compare with the exact value passed in: a membership check
            # against the same collection would also accept a swapped type.
            assert pattern.pattern_type == pt
|
|
|
|
|
|
class TestPatternAnalysisReportDataStructure:
    """Field-level checks for constructing ``PatternAnalysisReport`` directly."""

    def test_create_report(self):
        """A report built from empty collections exposes the values it was given."""
        empty_report = PatternAnalysisReport(
            brand_id="brand-123",
            query="test query",
            total_results=5,
            patterns=[],
            content_structure_insights={},
            authority_signal_insights={},
            citation_format_insights={},
            engine_preferences={},
            recommendations=[],
        )
        assert empty_report.brand_id == "brand-123"
        assert empty_report.total_results == 5
        assert empty_report.patterns == []
        assert empty_report.recommendations == []

    def test_report_with_patterns(self):
        """A report keeps the single pattern, insight and recommendation passed in."""
        faq = CitationPattern(
            pattern_type="content_structure",
            pattern_name="faq_format",
            frequency=0.7,
            confidence=0.9,
            description="FAQ detected",
            details={"count": 7},
        )
        populated = PatternAnalysisReport(
            brand_id="b1",
            query="q",
            total_results=10,
            patterns=[faq],
            content_structure_insights={"faq_frequency": 0.7},
            authority_signal_insights={},
            citation_format_insights={},
            engine_preferences={},
            recommendations=["Add FAQ sections"],
        )
        assert len(populated.patterns) == 1
        assert populated.content_structure_insights["faq_frequency"] == 0.7
        assert len(populated.recommendations) == 1
|
|
|
|
|
|
class TestContentStructureAnalyzer:
    """Exercise ``ContentStructureAnalyzer.analyze`` on formatted and plain text."""

    @pytest.fixture
    def analyzer(self):
        """Provide a new ``ContentStructureAnalyzer`` for each test."""
        return ContentStructureAnalyzer()

    @staticmethod
    def _by_name(patterns, name):
        """Return the patterns whose ``pattern_name`` equals ``name``."""
        return [p for p in patterns if p.pattern_name == name]

    def test_faq_format_detection(self, analyzer):
        """Q/A text in English and Chinese reports one ``faq_format`` pattern."""
        # Sample text is kept flush-left inside the literal.
        text = """
Q: What is GEO?
A: GEO stands for Generative Engine Optimization.

Q: How does GEO work?
A: GEO works by optimizing content for AI engines.

常见问题:
问题: 什么是SEO?
回答: SEO是搜索引擎优化。
"""
        found = analyzer.analyze([_make_result(raw_response=text)])
        faq = self._by_name(found, "faq_format")
        assert len(faq) == 1
        assert faq[0].frequency > 0
        assert faq[0].pattern_type == "content_structure"

    def test_list_format_detection(self, analyzer):
        """Numbered and dashed items report one ``list_format`` pattern."""
        text = """
Here are the top features:
1. Fast performance
2. Easy to use
3. Affordable price

- Benefit A
- Benefit B
- Benefit C
"""
        found = analyzer.analyze([_make_result(raw_response=text)])
        lists = self._by_name(found, "list_format")
        assert len(lists) == 1
        assert lists[0].frequency > 0

    def test_table_format_detection(self, analyzer):
        """Pipe-delimited and dash-ruled tables report one ``table_format`` pattern."""
        text = """
| Feature | Plan A | Plan B |
|---------|--------|--------|
| Price | $10 | $20 |

Comparison table:
Item Value
---- -----
A 100
B 200
"""
        found = analyzer.analyze([_make_result(raw_response=text)])
        tables = self._by_name(found, "table_format")
        assert len(tables) == 1
        assert tables[0].frequency > 0

    def test_quote_block_detection(self, analyzer):
        """A ``>`` quote and a bare quoted sentence report one ``quote_block`` pattern."""
        text = """
According to the expert:
> "This is the best solution on the market."

As stated in the report:
"The company leads in innovation."
"""
        found = analyzer.analyze([_make_result(raw_response=text)])
        quotes = self._by_name(found, "quote_block")
        assert len(quotes) == 1
        assert quotes[0].frequency > 0

    def test_no_structure_detected(self, analyzer):
        """Every pattern returned for unformatted text has frequency 0.0."""
        text = "This is a plain text response without any special formatting."
        found = analyzer.analyze([_make_result(raw_response=text)])
        assert all(p.frequency == 0.0 for p in found)

    def test_multiple_results_aggregation(self, analyzer):
        """One FAQ and one list response out of three each give a frequency of 1/3."""
        batch = [
            _make_result(raw_response="Q: What? A: Something."),
            _make_result(raw_response="Plain text without structure."),
            _make_result(raw_response="1. Item one\n2. Item two"),
        ]
        found = analyzer.analyze(batch)
        faq = self._by_name(found, "faq_format")[0]
        lists = self._by_name(found, "list_format")[0]
        one_third = pytest.approx(1 / 3, rel=0.01)
        assert faq.frequency == one_third
        assert lists.frequency == one_third
|
|
|
|
|
|
class TestAuthoritySignalAnalyzer:
    """Exercise ``AuthoritySignalAnalyzer.analyze`` on data, expert and certification text."""

    @pytest.fixture
    def analyzer(self):
        """Provide a new ``AuthoritySignalAnalyzer`` for each test."""
        return AuthoritySignalAnalyzer()

    @staticmethod
    def _by_name(patterns, name):
        """Return the patterns whose ``pattern_name`` equals ``name``."""
        return [p for p in patterns if p.pattern_name == name]

    def test_data_citation_detection(self, analyzer):
        """Study and statistics sentences report one ``data_citation`` with matches."""
        # Sample text is kept flush-left inside the literal.
        text = """
According to a 2024 study by MIT, 78% of companies adopted AI.
Research from Stanford University shows that productivity increased by 45%.
Data from the World Health Organization indicates a 30% reduction.
Statistics show that 90% of users prefer this approach.
"""
        found = analyzer.analyze([_make_result(raw_response=text)])
        data = self._by_name(found, "data_citation")
        assert len(data) == 1
        assert data[0].frequency > 0
        assert data[0].details["match_count"] > 0

    def test_expert_citation_detection(self, analyzer):
        """Sentences naming doctors, professors and experts report one ``expert_citation``."""
        text = """
Dr. Smith from Harvard notes that this trend will continue.
Professor Johnson at Stanford recommends this approach.
Expert analyst Jane Doe suggests using this method.
According to industry expert John, the market will grow.
"""
        found = analyzer.analyze([_make_result(raw_response=text)])
        experts = self._by_name(found, "expert_citation")
        assert len(experts) == 1
        assert experts[0].frequency > 0

    def test_certification_mark_detection(self, analyzer):
        """Text naming ISO, FDA, CE, UL, SOC 2 and HIPAA reports one ``certification_mark``."""
        text = """
The product is ISO 9001 certified and FDA approved.
It has received CE certification and UL listed status.
SOC 2 Type II compliant and HIPAA compliant.
"""
        found = analyzer.analyze([_make_result(raw_response=text)])
        certs = self._by_name(found, "certification_mark")
        assert len(certs) == 1
        assert certs[0].frequency > 0

    def test_no_authority_signals(self, analyzer):
        """Every pattern returned for text without signals has frequency 0.0."""
        text = "This is a basic response without any authority signals."
        found = analyzer.analyze([_make_result(raw_response=text)])
        assert all(p.frequency == 0.0 for p in found)

    def test_multiple_results_aggregation(self, analyzer):
        """One data and one expert response out of three each give a frequency of 1/3."""
        batch = [
            _make_result(raw_response="According to a 2024 study, 80% agree."),
            _make_result(raw_response="No authority signals here."),
            _make_result(raw_response="Dr. Lee from Oxford confirms the findings."),
        ]
        found = analyzer.analyze(batch)
        data = self._by_name(found, "data_citation")[0]
        expert = self._by_name(found, "expert_citation")[0]
        one_third = pytest.approx(1 / 3, rel=0.01)
        assert data.frequency == one_third
        assert expert.frequency == one_third
|
|
|
|
|
|
class TestCitationFormatAnalyzer:
    """Exercise ``CitationFormatAnalyzer.analyze`` on direct, indirect and comparison text."""

    @pytest.fixture
    def analyzer(self):
        """Provide a new ``CitationFormatAnalyzer`` for each test."""
        return CitationFormatAnalyzer()

    @staticmethod
    def _by_name(patterns, name):
        """Return the patterns whose ``pattern_name`` equals ``name``."""
        return [p for p in patterns if p.pattern_name == name]

    def test_direct_citation_detection(self, analyzer):
        """Sentences attributing statements to BrandX report one ``direct_citation``."""
        # Sample text is kept flush-left inside the literal.
        text = """
According to BrandX, their product is the best in class.
BrandX states that they have over 1 million users.
BrandX claims to be the industry leader.
"""
        cited = _make_result(
            raw_response=text,
            has_brand_citation=True,
            brand_context="BrandX states that they have over 1 million users",
        )
        found = analyzer.analyze([cited])
        direct = self._by_name(found, "direct_citation")
        assert len(direct) == 1
        assert direct[0].frequency > 0

    def test_indirect_citation_detection(self, analyzer):
        """A brand-cited response that never names the brand yields one ``indirect_citation``."""
        text = """
Some leading solutions in this space include comprehensive platforms
that offer multiple features. One such platform provides AI-powered
analytics and real-time monitoring capabilities.
"""
        cited = _make_result(
            raw_response=text,
            has_brand_citation=True,
            brand_context="One such platform provides AI-powered analytics",
        )
        found = analyzer.analyze([cited])
        indirect = self._by_name(found, "indirect_citation")
        assert len(indirect) == 1

    def test_comparison_citation_detection(self, analyzer):
        """Brand-versus-competitor sentences report one ``comparison_citation``."""
        text = """
Compared to BrandX, CompetitorY offers better pricing.
While BrandX focuses on enterprise, CompetitorY targets SMBs.
BrandX vs CompetitorY: BrandX has more features but CompetitorY is cheaper.
"""
        cited = _make_result(
            raw_response=text,
            has_brand_citation=True,
            has_competitor_citation=True,
            brand_context="Compared to BrandX",
            competitor_contexts=["CompetitorY offers better pricing"],
        )
        found = analyzer.analyze([cited])
        comparison = self._by_name(found, "comparison_citation")
        assert len(comparison) == 1
        assert comparison[0].frequency > 0

    def test_no_citation_format(self, analyzer):
        """Every pattern returned for an uncited response has frequency 0.0."""
        uncited = _make_result(
            raw_response="Generic response without citations.",
            has_brand_citation=False,
        )
        found = analyzer.analyze([uncited])
        assert all(p.frequency == 0.0 for p in found)

    def test_citation_with_position_info(self, analyzer):
        """A result carrying a positioned ``CitationInfo`` still counts as a direct citation."""
        source = CitationInfo(
            source_url="https://example.com",
            source_title="Example",
            citation_context="BrandX is mentioned here",
            confidence=0.9,
            position=0,
        )
        cited = _make_result(
            raw_response="BrandX is mentioned here.",
            has_brand_citation=True,
            brand_context="BrandX is mentioned here",
            citations=[source],
        )
        found = analyzer.analyze([cited])
        direct = self._by_name(found, "direct_citation")[0]
        assert direct.frequency > 0
|
|
|
|
|
|
class TestEnginePreferenceAnalyzer:
    """Exercise ``EnginePreferenceAnalyzer.analyze`` and the per-engine mapping it returns."""

    @pytest.fixture
    def analyzer(self):
        """Provide a new ``EnginePreferenceAnalyzer`` for each test."""
        return EnginePreferenceAnalyzer()

    def test_single_engine_citation_rate(self, analyzer):
        """One cited and one uncited ChatGPT result give a citation rate of 0.5."""
        batch = [
            _make_result(engine_type=EngineType.CHATGPT, has_brand_citation=flag)
            for flag in (True, False)
        ]
        prefs = analyzer.analyze(batch)
        assert "chatgpt" in prefs
        assert prefs["chatgpt"]["citation_rate"] == 0.5

    def test_multi_engine_preferences(self, analyzer):
        """Four engines produce four entries, each carrying the three expected keys."""
        batch = [
            _make_result(engine_type=EngineType.CHATGPT, has_brand_citation=True),
            _make_result(engine_type=EngineType.PERPLEXITY, has_brand_citation=True),
            _make_result(engine_type=EngineType.KIMI, has_brand_citation=False),
            _make_result(engine_type=EngineType.DEEPSEEK, has_brand_citation=True),
        ]
        prefs = analyzer.analyze(batch)
        assert len(prefs) == 4
        required_keys = ("citation_rate", "avg_citation_position", "format_preferences")
        for engine_data in prefs.values():
            for key in required_keys:
                assert key in engine_data

    def test_citation_position_preference(self, analyzer):
        """A citation at position 0 averages lower than one at position 5."""

        def cited_at(engine_type, position):
            # Same citation payload for both engines; only the position differs.
            return _make_result(
                engine_type=engine_type,
                has_brand_citation=True,
                citations=[
                    CitationInfo(
                        source_url="https://example.com",
                        source_title="Example",
                        citation_context="test",
                        confidence=0.9,
                        position=position,
                    )
                ],
            )

        batch = [cited_at(EngineType.CHATGPT, 0), cited_at(EngineType.PERPLEXITY, 5)]
        prefs = analyzer.analyze(batch)
        chatgpt_position = prefs["chatgpt"]["avg_citation_position"]
        perplexity_position = prefs["perplexity"]["avg_citation_position"]
        assert chatgpt_position < perplexity_position

    def test_format_preferences(self, analyzer):
        """A ChatGPT response with Q/A and numbered items lists both ``faq`` and ``list``."""
        structured = _make_result(
            engine_type=EngineType.CHATGPT,
            raw_response="Q: What? A: Something.\n1. First item\n2. Second item",
        )
        plain = _make_result(
            engine_type=EngineType.PERPLEXITY,
            raw_response="Plain text without structure.",
        )
        prefs = analyzer.analyze([structured, plain])
        assert "faq" in prefs["chatgpt"]["format_preferences"]
        assert "list" in prefs["chatgpt"]["format_preferences"]

    def test_empty_results(self, analyzer):
        """An empty result list yields an empty mapping."""
        assert analyzer.analyze([]) == {}
|
|
|
|
|
|
class TestCitationPatternEngine:
    """End-to-end checks of ``CitationPatternEngine.analyze`` and the report it returns."""

    @pytest.fixture
    def engine(self):
        """Provide a new ``CitationPatternEngine`` for each test."""
        return CitationPatternEngine()

    def test_full_analysis_flow(self, engine):
        """Two cited results from different engines produce a fully typed report."""
        chatgpt_result = _make_result(
            engine_type=EngineType.CHATGPT,
            raw_response=(
                "Q: What is GEO? A: GEO is optimization for AI.\n"
                "1. First benefit\n"
                "2. Second benefit"
            ),
            has_brand_citation=True,
            brand_context="BrandX is mentioned",
            citations=[
                CitationInfo(
                    source_url="https://example.com",
                    source_title="Example",
                    citation_context="BrandX is mentioned",
                    confidence=0.9,
                    position=0,
                )
            ],
        )
        perplexity_result = _make_result(
            engine_type=EngineType.PERPLEXITY,
            raw_response=(
                "According to a 2024 study, 80% of companies use AI. "
                "Dr. Smith recommends BrandX."
            ),
            has_brand_citation=True,
            brand_context="Dr. Smith recommends BrandX",
            citations=[
                CitationInfo(
                    source_url="https://example.com",
                    source_title="Example",
                    citation_context="Dr. Smith recommends BrandX",
                    confidence=0.8,
                    position=2,
                )
            ],
        )
        report = engine.analyze(
            [chatgpt_result, perplexity_result],
            brand_id="brand-123",
            query="what is geo",
        )
        assert isinstance(report, PatternAnalysisReport)
        assert report.brand_id == "brand-123"
        assert report.query == "what is geo"
        assert report.total_results == 2
        assert len(report.patterns) > 0
        mapping_fields = (
            report.content_structure_insights,
            report.authority_signal_insights,
            report.citation_format_insights,
            report.engine_preferences,
        )
        for value in mapping_fields:
            assert isinstance(value, dict)
        assert isinstance(report.recommendations, list)

    def test_empty_input_handling(self, engine):
        """No input results gives a report with zero totals and empty collections."""
        report = engine.analyze([], brand_id="brand-1", query="test")
        assert report.total_results == 0
        assert report.patterns == []
        assert report.content_structure_insights == {}
        assert report.authority_signal_insights == {}
        assert report.citation_format_insights == {}
        assert report.engine_preferences == {}
        assert report.recommendations == []

    def test_single_engine_result(self, engine):
        """A single ChatGPT result is counted once and listed under ``chatgpt``."""
        only_result = _make_result(
            engine_type=EngineType.CHATGPT,
            raw_response="Q: What? A: This.",
            has_brand_citation=True,
            brand_context="BrandX is great",
        )
        report = engine.analyze([only_result], brand_id="b1", query="q")
        assert report.total_results == 1
        assert "chatgpt" in report.engine_preferences
        assert len(report.patterns) > 0

    def test_multi_engine_aggregation(self, engine):
        """Three engines are reported separately with their own citation rates."""
        batch = [
            _make_result(
                engine_type=EngineType.CHATGPT,
                raw_response="Q: What? A: This. BrandX is the best.",
                has_brand_citation=True,
                brand_context="BrandX is the best",
            ),
            _make_result(
                engine_type=EngineType.PERPLEXITY,
                raw_response="1. First point\n2. Second point\nBrandX offers great value.",
                has_brand_citation=True,
                brand_context="BrandX offers great value",
            ),
            _make_result(
                engine_type=EngineType.KIMI,
                raw_response="Plain response. No citations here.",
                has_brand_citation=False,
            ),
        ]
        report = engine.analyze(batch, brand_id="b1", query="q")
        assert report.total_results == 3
        per_engine = report.engine_preferences
        assert len(per_engine) == 3
        chatgpt_rate = per_engine["chatgpt"]["citation_rate"]
        kimi_rate = per_engine["kimi"]["citation_rate"]
        assert chatgpt_rate == 1.0
        assert kimi_rate == 0.0

    def test_pattern_report_generation(self, engine):
        """One rich result yields patterns of all three analyzer-backed types."""
        rich_result = _make_result(
            engine_type=EngineType.CHATGPT,
            raw_response=(
                "Q: What is GEO? A: GEO optimization.\n"
                "According to 2024 research, 75% agree."
            ),
            has_brand_citation=True,
            brand_context="BrandX provides GEO",
            citations=[
                CitationInfo(
                    source_url="https://brandx.com",
                    source_title="BrandX",
                    citation_context="BrandX provides GEO",
                    confidence=0.95,
                    position=0,
                )
            ],
        )
        report = engine.analyze([rich_result], brand_id="b1", query="geo optimization")
        assert len(report.patterns) > 0
        seen_types = {p.pattern_type for p in report.patterns}
        assert "content_structure" in seen_types
        assert "authority_signal" in seen_types
        assert "citation_format" in seen_types

    def test_recommendations_generated(self, engine):
        """The report's recommendations are a list even when the brand is not cited."""
        uncited = _make_result(
            engine_type=EngineType.CHATGPT,
            raw_response="Q: What? A: Something.",
            has_brand_citation=False,
        )
        report = engine.analyze([uncited], brand_id="b1", query="q")
        assert isinstance(report.recommendations, list)

    def test_insights_populated(self, engine):
        """FAQ, list and certification text fills all three insight mappings."""
        mixed = _make_result(
            engine_type=EngineType.CHATGPT,
            raw_response="Q: What? A: This.\n1. Item\nISO 9001 certified.",
            has_brand_citation=True,
            brand_context="BrandX is here",
        )
        report = engine.analyze([mixed], brand_id="b1", query="q")
        assert len(report.content_structure_insights) > 0
        assert len(report.authority_signal_insights) > 0
        assert len(report.citation_format_insights) > 0
|