geo/backend/app/agent_framework/agents/schema_advisor.py

import copy
import json
import logging
import time
from datetime import datetime, timezone

from app.agent_framework.base import BaseAgent
from app.agent_framework.prompts.schema_advisor import SCHEMA_ADVISOR_TEMPLATE
from app.agent_framework.protocol import (
    AgentCapability,
    AgentType,
    TaskMessage,
    TaskResult,
    TaskStatus,
)
from app.services.llm import LLMFactory, LLMError
from app.utils.json_extractor import extract_json

logger = logging.getLogger(__name__)

SCHEMA_TEMPLATES = {
    "Organization": {
        "@context": "https://schema.org",
        "@type": "Organization",
        "name": "",
        "description": "",
        "url": "",
        "logo": "",
        "sameAs": [],
        "contactPoint": {
            "@type": "ContactPoint",
            "contactType": "customer service",
            "telephone": "",
        },
    },
    "Product": {
        "@context": "https://schema.org",
        "@type": "Product",
        "name": "",
        "description": "",
        "brand": {"@type": "Brand", "name": ""},
        "offers": {
            "@type": "Offer",
            "priceCurrency": "CNY",
            "availability": "https://schema.org/InStock",
        },
    },
    "FAQPage": {
        "@context": "https://schema.org",
        "@type": "FAQPage",
        "mainEntity": [
            {
                "@type": "Question",
                "name": "",
                "acceptedAnswer": {
                    "@type": "Answer",
                    "text": "",
                },
            }
        ],
    },
    "Article": {
        "@context": "https://schema.org",
        "@type": "Article",
        "headline": "",
        "description": "",
        "author": {"@type": "Organization", "name": ""},
        "datePublished": "",
        "image": "",
    },
    "LocalBusiness": {
        "@context": "https://schema.org",
        "@type": "LocalBusiness",
        "name": "",
        "address": {
            "@type": "PostalAddress",
            "streetAddress": "",
            "addressLocality": "",
            "addressRegion": "",
            "postalCode": "",
            "addressCountry": "CN",
        },
        "geo": {
            "@type": "GeoCoordinates",
            "latitude": "",
            "longitude": "",
        },
        "telephone": "",
        "openingHours": "",
    },
}

DIMENSION_SCHEMA_MAP = {
    "schema_marketing": ["Organization", "LocalBusiness"],
    "entity_clarity": ["Organization", "Product"],
    "citation_readiness": ["FAQPage", "Article"],
    "brand_visibility": ["Organization", "Product"],
    "local_seo": ["LocalBusiness"],
}

PRIORITY_THRESHOLD = {
    "high": 30.0,
    "medium": 60.0,
}

DIFFICULTY_MAP = {
    "Organization": "easy",
    "Product": "medium",
    "FAQPage": "medium",
    "Article": "easy",
    "LocalBusiness": "hard",
}


class SchemaAdvisorAgent(BaseAgent):

    def __init__(self):
        super().__init__(
            name="schema_advisor",
            agent_type=AgentType.SCHEMA_ADVISOR,
            version="1.0.0",
        )

    def get_capabilities(self) -> AgentCapability:
        return AgentCapability(
            agent_name=self.name,
            agent_type=self.agent_type,
            version=self.version,
            supported_tasks=["schema_advise"],
            max_concurrency=2,
            description="Schema优化建议Agent：识别Schema缺失维度，生成JSON-LD结构化数据建议",
        )

    async def execute(self, task: TaskMessage) -> TaskResult:
        started_at = datetime.now(timezone.utc)
        start_time = time.monotonic()

        try:
            output = await self._advise(task)

            elapsed = time.monotonic() - start_time
            return TaskResult(
                task_id=task.task_id,
                agent_name=self.name,
                status=TaskStatus.COMPLETED,
                output_data=output,
                error_message=None,
                started_at=started_at,
                completed_at=datetime.now(timezone.utc),
                metrics={
                    "elapsed_seconds": round(elapsed, 2),
                    "task_type": task.task_type,
                },
            )

        except LLMError as e:
            elapsed = time.monotonic() - start_time
            logger.error(f"SchemaAdvisor LLM error on task {task.task_id}: {e}")
            return TaskResult(
                task_id=task.task_id,
                agent_name=self.name,
                status=TaskStatus.FAILED,
                output_data=None,
                error_message=f"LLM调用失败: {e}",
                started_at=started_at,
                completed_at=datetime.now(timezone.utc),
                metrics={
                    "elapsed_seconds": round(elapsed, 2),
                    "task_type": task.task_type,
                },
            )

        except Exception as e:
            elapsed = time.monotonic() - start_time
            logger.error(f"SchemaAdvisor task {task.task_id} failed: {e}")
            return TaskResult(
                task_id=task.task_id,
                agent_name=self.name,
                status=TaskStatus.FAILED,
                output_data=None,
                error_message=str(e),
                started_at=started_at,
                completed_at=datetime.now(timezone.utc),
                metrics={
                    "elapsed_seconds": round(elapsed, 2),
                    "task_type": task.task_type,
                },
            )

    async def _advise(self, task: TaskMessage) -> dict:
        input_data = task.input_data
        brand_id = input_data.get("brand_id")
        diagnosis_data = input_data.get("diagnosis_data", {})
        brand_info = input_data.get("brand_info", {})
        focus_dimensions = input_data.get("focus_dimensions")

        if not brand_id:
            raise ValueError("input_data必须包含'brand_id'字段")

        await self.report_progress(
            task_id=task.task_id,
            progress=0.1,
            message="开始Schema建议分析...",
        )

        missing_dimensions = self._identify_missing_dimensions(diagnosis_data, focus_dimensions)

        await self.report_progress(
            task_id=task.task_id,
            progress=0.3,
            message=f"识别到{len(missing_dimensions)}个Schema缺失维度...",
        )

        matched = self._match_templates(missing_dimensions)

        await self.report_progress(
            task_id=task.task_id,
            progress=0.5,
            message="匹配预定义模板完成，开始LLM填充...",
        )

        filled = await self._fill_with_llm(matched, brand_info)

        await self.report_progress(
            task_id=task.task_id,
            progress=0.8,
            message="LLM填充完成，验证JSON-LD格式...",
        )

        validated = self._validate_and_sort(filled)

        await self.report_progress(
            task_id=task.task_id,
            progress=1.0,
            message="Schema建议生成完成",
        )

        return {
            "brand_id": brand_id,
            "suggestions": validated,
            "total": len(validated),
        }

    def _identify_missing_dimensions(
        self,
        diagnosis_data: dict,
        focus_dimensions: list[str] | None = None,
    ) -> list[dict]:
        dimensions = []
        dimension_scores = diagnosis_data.get("dimensions", {})
        for dim_name, dim_info in dimension_scores.items():
            if dim_name not in DIMENSION_SCHEMA_MAP:
                continue
            if focus_dimensions and dim_name not in focus_dimensions:
                continue
            score = dim_info.get("score", 0) if isinstance(dim_info, dict) else dim_info
            max_score = dim_info.get("max_score", 100) if isinstance(dim_info, dict) else 100
            percentage = (score / max_score * 100) if max_score > 0 else 0
            if percentage < 80:
                dimensions.append({
                    "dimension": dim_name,
                    "current_score": round(score, 2),
                    "max_score": max_score,
                    "percentage": round(percentage, 2),
                })
        if not dimensions and diagnosis_data:
            overall = diagnosis_data.get("overall_score", 0)
            if overall < 80:
                for dim_name in DIMENSION_SCHEMA_MAP:
                    if focus_dimensions and dim_name not in focus_dimensions:
                        continue
                    dimensions.append({
                        "dimension": dim_name,
                        "current_score": 0,
                        "max_score": 100,
                        "percentage": 0,
                    })
        return dimensions

    def _match_templates(self, missing_dimensions: list[dict]) -> list[dict]:
        matched = []
        seen_types = set()
        for dim in missing_dimensions:
            schema_types = DIMENSION_SCHEMA_MAP.get(dim["dimension"], [])
            for schema_type in schema_types:
                if schema_type in seen_types:
                    continue
                seen_types.add(schema_type)
                template = SCHEMA_TEMPLATES.get(schema_type)
                if template:
                    percentage = dim["percentage"]
                    if percentage < PRIORITY_THRESHOLD["high"]:
                        priority = "high"
                    elif percentage < PRIORITY_THRESHOLD["medium"]:
                        priority = "medium"
                    else:
                        priority = "low"
                    matched.append({
                        "schema_type": schema_type,
                        "priority": priority,
                        "diagnosis_dimensions": {
                            "dimension": dim["dimension"],
                            "current_score": dim["current_score"],
                            "max_score": dim["max_score"],
                            "percentage": dim["percentage"],
                        },
                        "json_ld_template": copy.deepcopy(template),
                        "implementation_difficulty": DIFFICULTY_MAP.get(schema_type, "medium"),
                    })
        return matched

    async def _fill_with_llm(self, matched: list[dict], brand_info: dict) -> list[dict]:
        provider = LLMFactory.get_default()
        results = []
        for item in matched:
            schema_type = item["schema_type"]
            try:
                variables = {
                    "brand_name": brand_info.get("name", ""),
                    "brand_website": brand_info.get("website", ""),
                    "brand_industry": brand_info.get("industry", ""),
                    "schema_type": schema_type,
                    "diagnosis_data": json.dumps(item.get("diagnosis_dimensions", {}), ensure_ascii=False),
                    "existing_schemas": "无",
                }
                messages = SCHEMA_ADVISOR_TEMPLATE.render(variables)
                response = await provider.chat(
                    messages,
                    temperature=0.3,
                    max_tokens=2048,
                )
                filled = json.loads(extract_json(response.content))
                item["json_ld_filled"] = filled
                item["estimated_impact"] = self._generate_impact_description(
                    schema_type, item.get("diagnosis_dimensions", {}).get("dimension", "")
                )
            except (json.JSONDecodeError, LLMError, ValueError) as e:
                logger.warning(f"LLM填充Schema {schema_type} 失败: {e}")
                item["json_ld_filled"] = None
                item["estimated_impact"] = self._generate_impact_description(
                    schema_type, item.get("diagnosis_dimensions", {}).get("dimension", "")
                )
            results.append(item)
        return results

    def _validate_json_ld(self, json_ld: dict) -> dict:
        errors = []
        warnings = []

        if not json_ld:
            return {"is_valid": False, "errors": ["JSON-LD为空"], "warnings": []}

        if "@context" not in json_ld:
            errors.append("缺少@context字段")

        if "@type" not in json_ld:
            errors.append("缺少@type字段")

        if "@context" in json_ld and json_ld["@context"] != "https://schema.org":
            warnings.append(f"@context值非标准: {json_ld.get('@context')}")

        if "@type" in json_ld and json_ld["@type"] not in SCHEMA_TEMPLATES:
            warnings.append(f"@type非推荐类型: {json_ld.get('@type')}")

        try:
            json.dumps(json_ld)
        except (json.JSONDecodeError, TypeError) as e:
            errors.append(f"JSON序列化失败: {e}")

        return {
            "is_valid": len(errors) == 0,
            "errors": errors,
            "warnings": warnings,
        }

    def _validate_and_sort(self, items: list[dict]) -> list[dict]:
        validated = []
        for item in items:
            json_ld_filled = item.get("json_ld_filled")
            if json_ld_filled:
                validation = self._validate_json_ld(json_ld_filled)
                item["validation_errors"] = None if validation["is_valid"] else {"errors": validation["errors"], "warnings": validation["warnings"]}
            else:
                item["validation_errors"] = {"errors": ["JSON-LD填充失败"], "warnings": []}
            validated.append(item)
        priority_order = {"high": 0, "medium": 1, "low": 2}
        validated.sort(key=lambda x: priority_order.get(x.get("priority", "medium"), 1))
        return validated

    def _generate_impact_description(self, schema_type: str, dimension: str) -> str:
        impacts = {
            "Organization": "增强品牌实体识别，提升AI搜索引擎对品牌的理解和引用概率",
            "Product": "提升产品在搜索结果中的富摘要展示，增加点击率和引用率",
            "FAQPage": "增加FAQ富摘要展示机会，提升在AI回答中的直接引用概率",
            "Article": "优化文章内容的结构化表达，提升AI搜索引擎的内容理解和引用",
            "LocalBusiness": "增强本地搜索可见性，提升地理位置相关查询的引用率",
        }
        return impacts.get(schema_type, f"提升{dimension}维度的得分和AI引用率")