fischer-agentkit/src/agentkit/tools/schema_tools.py

"""Schema 工具集 - 结构化数据提取与生成

SchemaExtractTool: 从 HTML 中提取 JSON-LD / Microdata / RDFa 等结构化数据
SchemaGenerateTool: 生成 Schema.org JSON-LD 标记
"""

import json
import logging
from typing import Any

from agentkit.tools.base import Tool

logger = logging.getLogger(__name__)

# Probe for the optional ``extruct`` dependency used by SchemaExtractTool.
# On ImportError the module name is bound to None and the availability flag
# is False, so callers can test the flag instead of importing again.
try:
    import extruct
except ImportError:
    extruct = None
    _EXTRUCT_AVAILABLE = False
else:
    _EXTRUCT_AVAILABLE = True

# Probe for the optional ``pydantic_schemaorg`` dependency used by
# SchemaGenerateTool. On ImportError the module name is bound to None and the
# availability flag is False.
try:
    import pydantic_schemaorg
except ImportError:
    pydantic_schemaorg = None
    _PYDANTIC_SCHEMAORG_AVAILABLE = False
else:
    _PYDANTIC_SCHEMAORG_AVAILABLE = True


class SchemaExtractTool(Tool):
    """Extract structured data (JSON-LD, Microdata, RDFa, Dublin Core) from HTML.

    Extraction is delegated to the optional ``extruct`` library. When
    ``extruct`` is not installed, ``execute`` returns an error result instead
    of raising.
    """

    # Format names accepted in the ``formats`` argument of ``execute``.
    SUPPORTED_FORMATS = {"json-ld", "microdata", "rdfa", "dublincore"}

    def __init__(
        self,
        name: str = "schema_extract",
        description: str = "从网页 HTML 中提取结构化数据（JSON-LD、Microdata、RDFa 等）",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Initialise the tool, substituting defaults for omitted schemas/tags.

        Args:
            name: Tool name forwarded to ``Tool.__init__``.
            description: Tool description forwarded to ``Tool.__init__``.
            input_schema: JSON schema of the arguments; falls back to
                ``_default_input_schema()`` when falsy.
            output_schema: JSON schema of the result; falls back to
                ``_default_output_schema()`` when falsy.
            version: Tool version string.
            tags: Tool tags; falls back to ``["schema", "extraction"]``.
        """
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["schema", "extraction"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the default JSON schema describing ``execute`` arguments."""
        return {
            "type": "object",
            "properties": {
                "url_or_html": {
                    "type": "string",
                    "description": "要提取的 URL 或原始 HTML 字符串",
                },
                "formats": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "要提取的格式列表",
                    "default": ["json-ld"],
                },
            },
            "required": ["url_or_html"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the default JSON schema describing the ``execute`` result."""
        return {
            "type": "object",
            "properties": {
                "schemas": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "format": {"type": "string"},
                            "data": {"type": "object"},
                        },
                    },
                    "description": "提取到的结构化数据列表",
                },
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息（仅失败时）"},
            },
        }

    def _is_url(self, text: str) -> bool:
        """Return True when ``text`` (ignoring outer whitespace) is an http(s) URL."""
        return text.strip().startswith(("http://", "https://"))

    @staticmethod
    def _failure(message: str) -> dict[str, Any]:
        """Build the uniform failure payload returned by ``execute``."""
        return {"error": message, "schemas": [], "success": False}

    @staticmethod
    def _normalize_formats(formats: Any) -> list[str] | None:
        """Coerce the ``formats`` argument into a de-duplicated list of strings.

        ``None`` selects the default ``["json-ld"]`` and a bare string is
        treated as a single-item list.

        Returns:
            The normalised list (input order kept), or ``None`` when the value
            is neither a string nor an iterable containing only strings.
        """
        if formats is None:
            return ["json-ld"]
        if isinstance(formats, str):
            return [formats]
        try:
            candidates = list(formats)
        except TypeError:
            return None
        if not all(isinstance(item, str) for item in candidates):
            return None
        # dict.fromkeys drops duplicates while keeping first-seen order, so a
        # repeated format does not produce repeated results.
        return list(dict.fromkeys(candidates))

    @staticmethod
    def _fetch_html(url: str) -> str:
        """Download ``url`` and return the decoded response body.

        This call blocks; ``execute`` runs it in a worker thread.

        Raises:
            Exception: whatever ``urllib.request.urlopen`` / ``read`` raise.
        """
        import urllib.request

        req = urllib.request.Request(url, headers={"User-Agent": "AgentKit/1.0"})
        with urllib.request.urlopen(req, timeout=30) as resp:
            body = resp.read()
            # Honour the charset declared in the Content-Type header when there
            # is one; the lookups are defensive so response objects without a
            # standard ``headers`` attribute still decode as UTF-8.
            headers = getattr(resp, "headers", None)
            get_charset = getattr(headers, "get_content_charset", None)
            declared = get_charset() if callable(get_charset) else None

        charset = declared if isinstance(declared, str) else "utf-8"
        try:
            return body.decode(charset, errors="replace")
        except LookupError:
            # The server declared an encoding Python does not know.
            return body.decode("utf-8", errors="replace")

    async def execute(self, **kwargs) -> dict:
        """Extract structured data from a URL or a raw HTML string.

        Args:
            url_or_html: URL (``http://`` / ``https://``) or raw HTML string
                (required).
            formats: Formats to extract; a list of names from
                ``SUPPORTED_FORMATS`` or a single name. Defaults to
                ``["json-ld"]``.

        Returns:
            On success ``{"schemas": [...], "success": True}`` where each entry
            is ``{"format": <name>, "data": <item>}``. On failure
            ``{"error": <message>, "schemas": [], "success": False}``.
        """
        url_or_html = kwargs.get("url_or_html")
        if not url_or_html:
            return self._failure("url_or_html 参数是必需的")

        formats = self._normalize_formats(kwargs.get("formats"))
        if formats is None:
            return self._failure("formats 必须是字符串列表")

        invalid_formats = set(formats) - self.SUPPORTED_FORMATS
        if invalid_formats:
            return self._failure(
                f"不支持的格式: {invalid_formats}，支持的格式: {self.SUPPORTED_FORMATS}"
            )

        # Graceful degradation when the optional dependency is missing.
        if not _EXTRUCT_AVAILABLE:
            return self._failure("extruct not installed. Run: pip install extruct")

        try:
            html = url_or_html
            url = None

            if self._is_url(url_or_html):
                import asyncio

                # _is_url ignores outer whitespace, so fetch the stripped form.
                url = url_or_html.strip()
                try:
                    # urlopen blocks; keep it off the event loop thread.
                    html = await asyncio.to_thread(self._fetch_html, url)
                except Exception as e:
                    return self._failure(f"获取 URL 内容失败: {e}")

            # extruct selects what to extract through ``syntaxes`` (a list);
            # it has no ``formats`` keyword.
            data = extruct.extract(
                html,
                base_url=url or "",
                syntaxes=formats,
            )

            # Flatten the per-format result lists into tagged entries.
            schemas: list[dict[str, Any]] = [
                {"format": fmt, "data": item}
                for fmt in formats
                for item in (data.get(fmt) or [])
            ]
            return {"schemas": schemas, "success": True}

        except Exception as e:
            logger.error("SchemaExtractTool 提取失败: %s", e)
            return self._failure(str(e))


class SchemaGenerateTool(Tool):
    """Generate Schema.org JSON-LD markup for a fixed set of common types.

    When the optional ``pydantic-schemaorg`` package is importable it is tried
    first, so the properties pass through its models. If that path is
    unavailable or fails, the JSON-LD object is assembled by hand, which needs
    no external dependency.
    """

    # Schema.org type names accepted by ``execute``.
    SUPPORTED_TYPES = {
        "Organization",
        "WebPage",
        "Article",
        "Product",
        "FAQPage",
        "HowTo",
        "LocalBusiness",
        "Person",
        "BreadcrumbList",
        "SiteNavigationElement",
    }

    def __init__(
        self,
        name: str = "schema_generate",
        description: str = "生成 Schema.org JSON-LD 结构化数据标记",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Initialise the tool, substituting defaults for omitted schemas/tags.

        Args:
            name: Tool name forwarded to ``Tool.__init__``.
            description: Tool description forwarded to ``Tool.__init__``.
            input_schema: JSON schema of the arguments; falls back to
                ``_default_input_schema()`` when falsy.
            output_schema: JSON schema of the result; falls back to
                ``_default_output_schema()`` when falsy.
            version: Tool version string.
            tags: Tool tags; falls back to ``["schema", "generation"]``.
        """
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["schema", "generation"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the default JSON schema describing ``execute`` arguments."""
        return {
            "type": "object",
            "properties": {
                "schema_type": {
                    "type": "string",
                    "description": "Schema.org 类型名称，如 Organization、FAQPage 等",
                },
                "properties": {
                    "type": "object",
                    "description": "Schema 属性字典",
                },
            },
            "required": ["schema_type", "properties"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the default JSON schema describing the ``execute`` result."""
        return {
            "type": "object",
            "properties": {
                "jsonld": {"type": "string", "description": "生成的 JSON-LD 字符串"},
                "schema_type": {"type": "string", "description": "Schema 类型"},
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息（仅失败时）"},
            },
        }

    def _generate_manual(self, schema_type: str, properties: dict[str, Any]) -> str:
        """Assemble the JSON-LD string by hand (no external dependency).

        ``properties`` is merged after ``@context`` / ``@type``, so keys with
        those names in ``properties`` replace the generated values.

        Raises:
            TypeError / ValueError: propagated from ``json.dumps`` when a value
                cannot be serialised.
        """
        jsonld_obj: dict[str, Any] = {
            "@context": "https://schema.org",
            "@type": schema_type,
        }
        jsonld_obj.update(properties)
        return json.dumps(jsonld_obj, ensure_ascii=False, indent=2)

    @staticmethod
    def _resolve_schemaorg_class(schema_type: str) -> Any:
        """Look up the pydantic-schemaorg model for ``schema_type``.

        A non-module attribute found directly on the package is returned as
        is. Otherwise the model is looked up inside the submodule named after
        the type.
        NOTE(review): the per-type submodule layout (for example
        ``pydantic_schemaorg.Organization.Organization``) follows the
        library's README; confirm against the installed version.

        Returns:
            The resolved object, or ``None`` when nothing suitable is found.
        """
        import importlib
        from types import ModuleType

        candidate = getattr(pydantic_schemaorg, schema_type, None)
        if candidate is not None and not isinstance(candidate, ModuleType):
            return candidate

        # Either the attribute is missing (submodule not imported yet) or it
        # is the submodule itself; in both cases the class lives inside it.
        module = candidate
        if module is None:
            if not schema_type.isidentifier():
                return None
            try:
                module = importlib.import_module(f"pydantic_schemaorg.{schema_type}")
            except ImportError:
                return None
        return getattr(module, schema_type, None)

    def _generate_with_schemaorg(self, schema_type: str, properties: dict[str, Any]) -> str | None:
        """Build the JSON-LD string through pydantic-schemaorg.

        Returns:
            The JSON-LD string, or ``None`` when the package is unavailable,
            no model is found for ``schema_type``, or instantiation / dumping /
            serialisation fails. ``None`` tells the caller to fall back to
            ``_generate_manual``.
        """
        if not _PYDANTIC_SCHEMAORG_AVAILABLE:
            return None

        try:
            schema_cls = self._resolve_schemaorg_class(schema_type)
            if schema_cls is None:
                return None

            instance = schema_cls(**properties)
            # Prefer the pydantic v2 API and fall back to v1. ``by_alias`` makes
            # alias-backed fields come out under their alias rather than the
            # Python attribute name.
            if hasattr(instance, "model_dump"):
                data = instance.model_dump(by_alias=True, exclude_none=True)
            elif hasattr(instance, "dict"):
                data = instance.dict(by_alias=True, exclude_none=True)
            else:
                return None

            jsonld_obj: dict[str, Any] = {
                "@context": "https://schema.org",
                "@type": schema_type,
            }
            jsonld_obj.update(data)
            return json.dumps(jsonld_obj, ensure_ascii=False, indent=2)
        except Exception as e:
            # Best effort by design: record why this path was skipped instead
            # of hiding it, then let the caller fall back.
            logger.warning(
                "pydantic-schemaorg 生成失败，降级为手动生成 (%s): %s", schema_type, e
            )
            return None

    async def execute(self, **kwargs) -> dict:
        """Generate JSON-LD for the given Schema.org type and properties.

        Args:
            schema_type: Schema.org type name (required); must be one of
                ``SUPPORTED_TYPES``.
            properties: Dictionary of schema properties (required).

        Returns:
            On success ``{"jsonld": <str>, "schema_type": <str>,
            "success": True}``. On failure ``{"error": <message>,
            "schema_type": <value>, "success": False}``.
        """
        schema_type = kwargs.get("schema_type")
        properties = kwargs.get("properties")

        if not schema_type:
            return {"error": "schema_type 参数是必需的", "schema_type": "", "success": False}

        if properties is None:
            return {"error": "properties 参数是必需的", "schema_type": schema_type, "success": False}

        if not isinstance(properties, dict):
            return {
                "error": "properties 必须是字典类型",
                "schema_type": schema_type,
                "success": False,
            }

        # The isinstance guard keeps an unhashable value (list, dict) from
        # raising TypeError in the set membership test.
        if not isinstance(schema_type, str) or schema_type not in self.SUPPORTED_TYPES:
            return {
                "error": f"不支持的 schema_type: {schema_type}，支持的类型: {sorted(self.SUPPORTED_TYPES)}",
                "schema_type": schema_type,
                "success": False,
            }

        try:
            # Try the pydantic-schemaorg path first.
            jsonld = self._generate_with_schemaorg(schema_type, properties)

            # Fall back to manual assembly.
            if jsonld is None:
                jsonld = self._generate_manual(schema_type, properties)

            return {
                "jsonld": jsonld,
                "schema_type": schema_type,
                "success": True,
            }

        except Exception as e:
            logger.error("SchemaGenerateTool 生成失败: %s", e)
            return {
                "error": str(e),
                "schema_type": schema_type,
                "success": False,
            }