399 lines
14 KiB
Python
399 lines
14 KiB
Python
import copy
|
||
import json
|
||
import logging
|
||
import time
|
||
from datetime import datetime, timezone
|
||
|
||
from app.agent_framework.base import BaseAgent
|
||
from app.agent_framework.prompts.schema_advisor import SCHEMA_ADVISOR_TEMPLATE
|
||
from app.agent_framework.protocol import (
|
||
AgentCapability,
|
||
AgentType,
|
||
TaskMessage,
|
||
TaskResult,
|
||
TaskStatus,
|
||
)
|
||
from app.services.llm import LLMFactory, LLMError
|
||
from app.utils.json_extractor import extract_json
|
||
|
||
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
||
|
||
# Skeleton JSON-LD documents keyed by schema.org type name.
# Values are intentionally blank placeholders; consumers should work on a
# deep copy so the shared templates are never mutated.
SCHEMA_TEMPLATES = {
    # Brand / company entity.
    "Organization": {
        "@context": "https://schema.org",
        "@type": "Organization",
        "name": "",
        "description": "",
        "url": "",
        "logo": "",
        "sameAs": [],
        "contactPoint": {
            "@type": "ContactPoint",
            "contactType": "customer service",
            "telephone": "",
        },
    },
    # A sellable product with brand and offer details.
    "Product": {
        "@context": "https://schema.org",
        "@type": "Product",
        "name": "",
        "description": "",
        "brand": {"@type": "Brand", "name": ""},
        "offers": {
            "@type": "Offer",
            "priceCurrency": "CNY",
            "availability": "https://schema.org/InStock",
        },
    },
    # Question/answer page; ``mainEntity`` holds one sample Question entry.
    "FAQPage": {
        "@context": "https://schema.org",
        "@type": "FAQPage",
        "mainEntity": [
            {
                "@type": "Question",
                "name": "",
                "acceptedAnswer": {
                    "@type": "Answer",
                    "text": "",
                },
            }
        ],
    },
    # Editorial content authored by an organization.
    "Article": {
        "@context": "https://schema.org",
        "@type": "Article",
        "headline": "",
        "description": "",
        "author": {"@type": "Organization", "name": ""},
        "datePublished": "",
        "image": "",
    },
    # Physical location with postal address and coordinates.
    "LocalBusiness": {
        "@context": "https://schema.org",
        "@type": "LocalBusiness",
        "name": "",
        "address": {
            "@type": "PostalAddress",
            "streetAddress": "",
            "addressLocality": "",
            "addressRegion": "",
            "postalCode": "",
            "addressCountry": "CN",
        },
        "geo": {
            "@type": "GeoCoordinates",
            "latitude": "",
            "longitude": "",
        },
        "telephone": "",
        "openingHours": "",
    },
}
|
||
|
||
# Diagnosis dimension name -> schema.org types suggested when that dimension
# under-scores. Only dimensions listed here are considered by the advisor.
DIMENSION_SCHEMA_MAP = {
    "schema_marketing": ["Organization", "LocalBusiness"],
    "entity_clarity": ["Organization", "Product"],
    "citation_readiness": ["FAQPage", "Article"],
    "brand_visibility": ["Organization", "Product"],
    "local_seo": ["LocalBusiness"],
}
|
||
|
||
# Percentage cut-offs for suggestion priority: a score percentage strictly
# below "high" yields high priority, strictly below "medium" yields medium
# priority, and anything else is low priority.
PRIORITY_THRESHOLD = {
    "high": 30.0,
    "medium": 60.0,
}
|
||
|
||
# Implementation-difficulty label reported alongside each suggested schema type.
DIFFICULTY_MAP = {
    "Organization": "easy",
    "Product": "medium",
    "FAQPage": "medium",
    "Article": "easy",
    "LocalBusiness": "hard",
}
|
||
|
||
|
||
class SchemaAdvisorAgent(BaseAgent):
    """Agent that turns brand diagnosis scores into JSON-LD schema suggestions.

    The pipeline (see ``_advise``) is:

    1. find diagnosis dimensions whose score percentage is below the gap cut-off,
    2. map those dimensions to predefined schema.org JSON-LD templates,
    3. ask the default LLM provider to fill each template for the brand,
    4. validate the filled JSON-LD and sort suggestions by priority.
    """

    # A dimension (or the overall score) below this percentage counts as a gap.
    _GAP_PERCENTAGE = 80

    # Sort rank per priority label; unknown labels sort like "medium".
    _PRIORITY_ORDER = {"high": 0, "medium": 1, "low": 2}

    # Canned impact text per schema type, returned to the caller as-is.
    _IMPACT_DESCRIPTIONS = {
        "Organization": "增强品牌实体识别,提升AI搜索引擎对品牌的理解和引用概率",
        "Product": "提升产品在搜索结果中的富摘要展示,增加点击率和引用率",
        "FAQPage": "增加FAQ富摘要展示机会,提升在AI回答中的直接引用概率",
        "Article": "优化文章内容的结构化表达,提升AI搜索引擎的内容理解和引用",
        "LocalBusiness": "增强本地搜索可见性,提升地理位置相关查询的引用率",
    }

    def __init__(self):
        """Register the agent under the name ``schema_advisor``."""
        super().__init__(
            name="schema_advisor",
            agent_type=AgentType.SCHEMA_ADVISOR,
            version="1.0.0",
        )

    def get_capabilities(self) -> AgentCapability:
        """Describe this agent: it handles ``schema_advise`` tasks, two at a time."""
        return AgentCapability(
            agent_name=self.name,
            agent_type=self.agent_type,
            version=self.version,
            supported_tasks=["schema_advise"],
            max_concurrency=2,
            description="Schema优化建议Agent:识别Schema缺失维度,生成JSON-LD结构化数据建议",
        )

    async def execute(self, task: TaskMessage) -> TaskResult:
        """Run one task and wrap the outcome in a ``TaskResult``.

        Failures derived from ``Exception`` are not propagated: they are logged
        and reported as a ``FAILED`` result whose ``error_message`` carries the
        reason. LLM errors get a dedicated message prefix.

        Args:
            task: The task to process; ``task.input_data`` is read by ``_advise``.

        Returns:
            A ``COMPLETED`` result with the suggestions, or a ``FAILED`` result.
        """
        started_at = datetime.now(timezone.utc)
        start_time = time.monotonic()

        try:
            output = await self._advise(task)
            return self._build_result(
                task, TaskStatus.COMPLETED, started_at, start_time, output_data=output
            )
        except LLMError as e:
            logger.error(f"SchemaAdvisor LLM error on task {task.task_id}: {e}")
            return self._build_result(
                task,
                TaskStatus.FAILED,
                started_at,
                start_time,
                error_message=f"LLM调用失败: {e}",
            )
        except Exception as e:
            # Boundary handler: keep the traceback in the log so unexpected
            # failures remain diagnosable after being folded into a result.
            logger.exception(f"SchemaAdvisor task {task.task_id} failed: {e}")
            return self._build_result(
                task,
                TaskStatus.FAILED,
                started_at,
                start_time,
                error_message=str(e),
            )

    def _build_result(
        self,
        task: TaskMessage,
        status,
        started_at: datetime,
        start_time: float,
        *,
        output_data: dict | None = None,
        error_message: str | None = None,
    ) -> TaskResult:
        """Assemble a ``TaskResult`` with timing metrics for ``task``.

        ``start_time`` is the ``time.monotonic()`` reading taken when execution
        began; the elapsed time is measured at the moment of this call.
        """
        elapsed = time.monotonic() - start_time
        return TaskResult(
            task_id=task.task_id,
            agent_name=self.name,
            status=status,
            output_data=output_data,
            error_message=error_message,
            started_at=started_at,
            completed_at=datetime.now(timezone.utc),
            metrics={
                "elapsed_seconds": round(elapsed, 2),
                "task_type": task.task_type,
            },
        )

    async def _advise(self, task: TaskMessage) -> dict:
        """Produce schema suggestions for the brand described by ``task``.

        Reads ``brand_id`` (required), ``diagnosis_data``, ``brand_info`` and
        ``focus_dimensions`` from ``task.input_data`` and reports progress after
        each pipeline stage.

        Returns:
            ``{"brand_id": ..., "suggestions": [...], "total": int}``.

        Raises:
            ValueError: If ``brand_id`` is missing or empty.
        """
        # ``or {}`` also covers keys that are present but explicitly None.
        input_data = task.input_data or {}
        brand_id = input_data.get("brand_id")
        diagnosis_data = input_data.get("diagnosis_data") or {}
        brand_info = input_data.get("brand_info") or {}
        focus_dimensions = input_data.get("focus_dimensions")

        if not brand_id:
            raise ValueError("input_data必须包含'brand_id'字段")

        await self.report_progress(
            task_id=task.task_id,
            progress=0.1,
            message="开始Schema建议分析...",
        )

        missing_dimensions = self._identify_missing_dimensions(diagnosis_data, focus_dimensions)

        await self.report_progress(
            task_id=task.task_id,
            progress=0.3,
            message=f"识别到{len(missing_dimensions)}个Schema缺失维度...",
        )

        matched = self._match_templates(missing_dimensions)

        await self.report_progress(
            task_id=task.task_id,
            progress=0.5,
            message="匹配预定义模板完成,开始LLM填充...",
        )

        filled = await self._fill_with_llm(matched, brand_info)

        await self.report_progress(
            task_id=task.task_id,
            progress=0.8,
            message="LLM填充完成,验证JSON-LD格式...",
        )

        validated = self._validate_and_sort(filled)

        await self.report_progress(
            task_id=task.task_id,
            progress=1.0,
            message="Schema建议生成完成",
        )

        return {
            "brand_id": brand_id,
            "suggestions": validated,
            "total": len(validated),
        }

    @staticmethod
    def _as_number(value, default):
        """Coerce ``value`` to a number.

        Returns ``default`` for ``None``, the value itself for ints/floats, a
        ``float`` for numeric strings, and ``None`` when it cannot be converted.
        """
        if value is None:
            return default
        if isinstance(value, (int, float)):
            return value
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    def _identify_missing_dimensions(
        self,
        diagnosis_data: dict,
        focus_dimensions: list[str] | None = None,
    ) -> list[dict]:
        """List known dimensions whose score percentage is below the gap cut-off.

        Each entry of ``diagnosis_data["dimensions"]`` may be a bare number
        (scored out of 100) or a dict with ``score`` / ``max_score``. Dimensions
        not in ``DIMENSION_SCHEMA_MAP``, or excluded by a non-empty
        ``focus_dimensions``, are ignored. Entries with non-numeric scores are
        skipped with a warning instead of failing the whole task.

        Returns:
            Dicts with ``dimension``, ``current_score``, ``max_score`` and
            ``percentage`` keys.
        """
        diagnosis_data = diagnosis_data or {}
        dimension_scores = diagnosis_data.get("dimensions") or {}
        if not isinstance(dimension_scores, dict):
            logger.warning(
                f"Ignoring diagnosis 'dimensions' of unexpected type {type(dimension_scores).__name__}"
            )
            dimension_scores = {}

        dimensions = []
        for dim_name, dim_info in dimension_scores.items():
            if dim_name not in DIMENSION_SCHEMA_MAP:
                continue
            if focus_dimensions and dim_name not in focus_dimensions:
                continue

            if isinstance(dim_info, dict):
                raw_score = dim_info.get("score", 0)
                raw_max = dim_info.get("max_score", 100)
            else:
                raw_score, raw_max = dim_info, 100

            score = self._as_number(raw_score, 0)
            max_score = self._as_number(raw_max, 100)
            if score is None or max_score is None:
                logger.warning(f"Skipping dimension {dim_name}: non-numeric score data {dim_info!r}")
                continue

            # A non-positive max_score cannot yield a ratio; treat it as 0%.
            percentage = (score / max_score * 100) if max_score > 0 else 0
            if percentage < self._GAP_PERCENTAGE:
                dimensions.append({
                    "dimension": dim_name,
                    "current_score": round(score, 2),
                    "max_score": max_score,
                    "percentage": round(percentage, 2),
                })

        # Fallback: nothing was flagged per dimension, but the overall score
        # (treated as 0 when absent) is low, so every (focused) known dimension
        # is flagged at 0%.
        # NOTE(review): this also fires when per-dimension scores were present
        # and all healthy, reporting them as 0 -- confirm that is intended.
        if not dimensions and diagnosis_data:
            overall = self._as_number(diagnosis_data.get("overall_score", 0), 0)
            if overall is None:
                overall = 0
            if overall < self._GAP_PERCENTAGE:
                for dim_name in DIMENSION_SCHEMA_MAP:
                    if focus_dimensions and dim_name not in focus_dimensions:
                        continue
                    dimensions.append({
                        "dimension": dim_name,
                        "current_score": 0,
                        "max_score": 100,
                        "percentage": 0,
                    })
        return dimensions

    @staticmethod
    def _priority_for(percentage: float) -> str:
        """Map a score percentage onto ``"high"`` / ``"medium"`` / ``"low"``."""
        if percentage < PRIORITY_THRESHOLD["high"]:
            return "high"
        if percentage < PRIORITY_THRESHOLD["medium"]:
            return "medium"
        return "low"

    def _match_templates(self, missing_dimensions: list[dict]) -> list[dict]:
        """Build one suggestion stub per schema type implied by the gaps.

        Several dimensions can map to the same schema type. Each type appears
        once, in first-seen order, and is attributed to the *most deficient*
        dimension that requires it, so its priority does not depend on the
        order in which dimensions happen to arrive.

        Returns:
            Dicts with ``schema_type``, ``priority``, ``diagnosis_dimensions``,
            ``json_ld_template`` (a deep copy) and ``implementation_difficulty``.
        """
        # schema type -> worst dimension seen so far. Re-assigning an existing
        # key keeps its original position, preserving first-seen ordering.
        worst_by_type: dict[str, dict] = {}
        for dim in missing_dimensions:
            for schema_type in DIMENSION_SCHEMA_MAP.get(dim["dimension"], []):
                if not SCHEMA_TEMPLATES.get(schema_type):
                    continue
                current = worst_by_type.get(schema_type)
                if current is None or dim["percentage"] < current["percentage"]:
                    worst_by_type[schema_type] = dim

        matched = []
        for schema_type, dim in worst_by_type.items():
            matched.append({
                "schema_type": schema_type,
                "priority": self._priority_for(dim["percentage"]),
                "diagnosis_dimensions": {
                    "dimension": dim["dimension"],
                    "current_score": dim["current_score"],
                    "max_score": dim["max_score"],
                    "percentage": dim["percentage"],
                },
                # Deep copy so later edits never leak into the shared template.
                "json_ld_template": copy.deepcopy(SCHEMA_TEMPLATES[schema_type]),
                "implementation_difficulty": DIFFICULTY_MAP.get(schema_type, "medium"),
            })
        return matched

    async def _fill_with_llm(self, matched: list[dict], brand_info: dict) -> list[dict]:
        """Ask the default LLM provider to fill each matched template.

        Items are processed one after another and updated in place with
        ``json_ld_filled`` (the parsed JSON, or ``None`` when the call or the
        parsing failed) and ``estimated_impact``. A failure on one item is
        logged and does not abort the remaining items.
        """
        if not matched:
            # Nothing to fill: do not touch the LLM provider at all.
            return []

        brand_info = brand_info or {}
        provider = LLMFactory.get_default()
        results = []
        for item in matched:
            schema_type = item["schema_type"]
            diagnosis = item.get("diagnosis_dimensions", {})
            try:
                variables = {
                    "brand_name": brand_info.get("name", ""),
                    "brand_website": brand_info.get("website", ""),
                    "brand_industry": brand_info.get("industry", ""),
                    "schema_type": schema_type,
                    "diagnosis_data": json.dumps(diagnosis, ensure_ascii=False),
                    "existing_schemas": "无",
                }
                messages = SCHEMA_ADVISOR_TEMPLATE.render(variables)
                response = await provider.chat(
                    messages,
                    temperature=0.3,
                    max_tokens=2048,
                )
                raw_json = extract_json(response.content)
                # NOTE(review): assumes extract_json yields JSON text; a None
                # result is treated as "no JSON found" rather than letting
                # json.loads raise an uncaught TypeError -- confirm contract.
                if raw_json is None:
                    raise ValueError("no JSON found in LLM response")
                item["json_ld_filled"] = json.loads(raw_json)
            except (json.JSONDecodeError, LLMError, ValueError) as e:
                logger.warning(f"LLM填充Schema {schema_type} 失败: {e}")
                item["json_ld_filled"] = None

            # The impact text does not depend on whether the fill succeeded.
            item["estimated_impact"] = self._generate_impact_description(
                schema_type, diagnosis.get("dimension", "")
            )
            results.append(item)
        return results

    def _validate_json_ld(self, json_ld: dict) -> dict:
        """Check a JSON-LD document for the minimum required structure.

        Errors: empty payload, payload that is not a JSON object, missing
        ``@context`` / ``@type``, or a payload that cannot be serialized.
        Warnings: ``@context`` other than ``https://schema.org``, or an
        ``@type`` that names none of the predefined templates (``@type`` may be
        a single string or a list of strings).

        Returns:
            ``{"is_valid": bool, "errors": [...], "warnings": [...]}``.
        """
        if not json_ld:
            return {"is_valid": False, "errors": ["JSON-LD为空"], "warnings": []}

        if not isinstance(json_ld, dict):
            # e.g. the LLM returned a JSON array or a bare string.
            return {"is_valid": False, "errors": ["JSON-LD必须是JSON对象"], "warnings": []}

        errors = []
        warnings = []

        if "@context" not in json_ld:
            errors.append("缺少@context字段")
        elif json_ld["@context"] != "https://schema.org":
            warnings.append(f"@context值非标准: {json_ld.get('@context')}")

        if "@type" not in json_ld:
            errors.append("缺少@type字段")
        else:
            declared = json_ld["@type"]
            # Normalize to a list: a list-valued @type is unhashable and would
            # make a direct ``in SCHEMA_TEMPLATES`` lookup raise TypeError.
            candidates = declared if isinstance(declared, list) else [declared]
            if not any(isinstance(t, str) and t in SCHEMA_TEMPLATES for t in candidates):
                warnings.append(f"@type非推荐类型: {json_ld.get('@type')}")

        try:
            json.dumps(json_ld)
        except (TypeError, ValueError) as e:
            # dumps raises TypeError for unsupported objects and ValueError for
            # circular references.
            errors.append(f"JSON序列化失败: {e}")

        return {
            "is_valid": len(errors) == 0,
            "errors": errors,
            "warnings": warnings,
        }

    def _validate_and_sort(self, items: list[dict]) -> list[dict]:
        """Attach ``validation_errors`` to every item and sort by priority.

        ``validation_errors`` is ``None`` for a valid document; otherwise it is
        a dict with ``errors`` and ``warnings``. Items are updated in place and
        returned as a new list ordered high -> medium -> low; the sort is
        stable, so items of equal priority keep their relative order.
        """
        for item in items:
            json_ld_filled = item.get("json_ld_filled")
            if json_ld_filled:
                validation = self._validate_json_ld(json_ld_filled)
                if validation["is_valid"]:
                    item["validation_errors"] = None
                else:
                    item["validation_errors"] = {
                        "errors": validation["errors"],
                        "warnings": validation["warnings"],
                    }
            else:
                item["validation_errors"] = {"errors": ["JSON-LD填充失败"], "warnings": []}

        return sorted(
            items,
            key=lambda x: self._PRIORITY_ORDER.get(x.get("priority", "medium"), 1),
        )

    def _generate_impact_description(self, schema_type: str, dimension: str) -> str:
        """Return the canned impact text for ``schema_type``.

        Falls back to a generic sentence mentioning ``dimension`` when the
        schema type has no predefined description.
        """
        return self._IMPACT_DESCRIPTIONS.get(schema_type, f"提升{dimension}维度的得分和AI引用率")
|