"""Schema 工具集 - 结构化数据提取与生成 SchemaExtractTool: 从 HTML 中提取 JSON-LD / Microdata / RDFa 等结构化数据 SchemaGenerateTool: 生成 Schema.org JSON-LD 标记 """ import json import logging from typing import Any import httpx from agentkit.tools.base import Tool logger = logging.getLogger(__name__) # 检测 extruct 是否可用 _EXTRUCT_AVAILABLE = False extruct = None try: import extruct _EXTRUCT_AVAILABLE = True except ImportError: pass # 检测 pydantic_schemaorg 是否可用 _PYDANTIC_SCHEMAORG_AVAILABLE = False pydantic_schemaorg = None try: import pydantic_schemaorg _PYDANTIC_SCHEMAORG_AVAILABLE = True except ImportError: pass class SchemaExtractTool(Tool): """结构化数据提取工具 - 从 HTML 中提取 JSON-LD、Microdata、RDFa 等 使用 extruct 库进行提取,当 extruct 未安装时优雅降级。 """ SUPPORTED_FORMATS = {"json-ld", "microdata", "rdfa", "dublincore"} def __init__( self, name: str = "schema_extract", description: str = "从网页 HTML 中提取结构化数据(JSON-LD、Microdata、RDFa 等)", input_schema: dict[str, Any] | None = None, output_schema: dict[str, Any] | None = None, version: str = "1.0.0", tags: list[str] | None = None, ): super().__init__( name=name, description=description, input_schema=input_schema or self._default_input_schema(), output_schema=output_schema or self._default_output_schema(), version=version, tags=tags or ["schema", "extraction"], ) @staticmethod def _default_input_schema() -> dict[str, Any]: return { "type": "object", "properties": { "url_or_html": { "type": "string", "description": "要提取的 URL 或原始 HTML 字符串", }, "formats": { "type": "array", "items": {"type": "string"}, "description": "要提取的格式列表", "default": ["json-ld"], }, }, "required": ["url_or_html"], } @staticmethod def _default_output_schema() -> dict[str, Any]: return { "type": "object", "properties": { "schemas": { "type": "array", "items": { "type": "object", "properties": { "format": {"type": "string"}, "data": {"type": "object"}, }, }, "description": "提取到的结构化数据列表", }, "success": {"type": "boolean", "description": "是否成功"}, "error": {"type": "string", "description": "错误信息(仅失败时)"}, }, } def _is_url(self, text: str) -> bool: """判断输入是 URL 还是 HTML""" return text.strip().startswith(("http://", "https://")) async def execute(self, **kwargs) -> dict: """执行结构化数据提取 Args: url_or_html: URL 或原始 HTML 字符串(必需) formats: 要提取的格式列表(默认 ["json-ld"]) 可选: "json-ld", "microdata", "rdfa", "dublincore" Returns: 包含 schemas 列表和 success 布尔值的字典 """ url_or_html = kwargs.get("url_or_html") if not url_or_html: return {"error": "url_or_html 参数是必需的", "schemas": [], "success": False} formats = kwargs.get("formats", ["json-ld"]) # 验证格式 invalid_formats = set(formats) - self.SUPPORTED_FORMATS if invalid_formats: return { "error": f"不支持的格式: {invalid_formats},支持的格式: {self.SUPPORTED_FORMATS}", "schemas": [], "success": False, } # 优雅降级:extruct 未安装 if not _EXTRUCT_AVAILABLE: return { "error": "extruct not installed. Run: pip install extruct", "schemas": [], "success": False, } try: html = url_or_html url = None # 如果输入是 URL,先获取 HTML if self._is_url(url_or_html): url = url_or_html try: async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client: resp = await client.get(url, headers={"User-Agent": "AgentKit/1.0"}) html = resp.text except Exception as e: return { "error": f"获取 URL 内容失败: {e}", "schemas": [], "success": False, } # 使用 extruct 提取 data = extruct.extract( html, base_url=url or "", formats=formats, ) # 整理结果 schemas: list[dict[str, Any]] = [] for fmt in formats: items = data.get(fmt, []) if items: for item in items: schemas.append({"format": fmt, "data": item}) return {"schemas": schemas, "success": True} except Exception as e: logger.error(f"SchemaExtractTool 提取失败: {e}") return { "error": str(e), "schemas": [], "success": False, } class SchemaGenerateTool(Tool): """JSON-LD 结构化数据生成工具 - 为常见 Schema.org 类型生成标记 当 pydantic-schemaorg 可用时提供验证,否则手动构建 JSON-LD。 手动生成始终可用,无需外部依赖。 """ SUPPORTED_TYPES = { "Organization", "WebPage", "Article", "Product", "FAQPage", "HowTo", "LocalBusiness", "Person", "BreadcrumbList", "SiteNavigationElement", } def __init__( self, name: str = "schema_generate", description: str = "生成 Schema.org JSON-LD 结构化数据标记", input_schema: dict[str, Any] | None = None, output_schema: dict[str, Any] | None = None, version: str = "1.0.0", tags: list[str] | None = None, ): super().__init__( name=name, description=description, input_schema=input_schema or self._default_input_schema(), output_schema=output_schema or self._default_output_schema(), version=version, tags=tags or ["schema", "generation"], ) @staticmethod def _default_input_schema() -> dict[str, Any]: return { "type": "object", "properties": { "schema_type": { "type": "string", "description": "Schema.org 类型名称,如 Organization、FAQPage 等", }, "properties": { "type": "object", "description": "Schema 属性字典", }, }, "required": ["schema_type", "properties"], } @staticmethod def _default_output_schema() -> dict[str, Any]: return { "type": "object", "properties": { "jsonld": {"type": "string", "description": "生成的 JSON-LD 字符串"}, "schema_type": {"type": "string", "description": "Schema 类型"}, "success": {"type": "boolean", "description": "是否成功"}, "error": {"type": "string", "description": "错误信息(仅失败时)"}, }, } def _generate_manual(self, schema_type: str, properties: dict[str, Any]) -> str: """手动构建 JSON-LD(无需外部依赖)""" jsonld_obj: dict[str, Any] = { "@context": "https://schema.org", "@type": schema_type, } jsonld_obj.update(properties) return json.dumps(jsonld_obj, ensure_ascii=False, indent=2) def _generate_with_schemaorg(self, schema_type: str, properties: dict[str, Any]) -> str | None: """使用 pydantic-schemaorg 生成 JSON-LD(带验证)""" if not _PYDANTIC_SCHEMAORG_AVAILABLE: return None try: # 尝试获取对应的 pydantic_schemaorg 类 schema_cls = getattr(pydantic_schemaorg, schema_type, None) if schema_cls is None: return None instance = schema_cls(**properties) # pydantic_schemaorg 对象转 dict if hasattr(instance, "model_dump"): data = instance.model_dump(exclude_none=True) elif hasattr(instance, "dict"): data = instance.dict(exclude_none=True) else: return None jsonld_obj: dict[str, Any] = { "@context": "https://schema.org", "@type": schema_type, } jsonld_obj.update(data) return json.dumps(jsonld_obj, ensure_ascii=False, indent=2) except Exception: return None async def execute(self, **kwargs) -> dict: """执行 JSON-LD 生成 Args: schema_type: Schema.org 类型名称(必需,如 "Organization") properties: Schema 属性字典(必需) Returns: 包含 jsonld 字符串、schema_type 和 success 布尔值的字典 """ schema_type = kwargs.get("schema_type") properties = kwargs.get("properties") if not schema_type: return {"error": "schema_type 参数是必需的", "schema_type": "", "success": False} if properties is None: return {"error": "properties 参数是必需的", "schema_type": schema_type, "success": False} if not isinstance(properties, dict): return { "error": "properties 必须是字典类型", "schema_type": schema_type, "success": False, } # 验证 schema_type if schema_type not in self.SUPPORTED_TYPES: return { "error": f"不支持的 schema_type: {schema_type},支持的类型: {sorted(self.SUPPORTED_TYPES)}", "schema_type": schema_type, "success": False, } try: # 优先尝试使用 pydantic-schemaorg(带验证) jsonld = self._generate_with_schemaorg(schema_type, properties) # 降级到手动生成 if jsonld is None: jsonld = self._generate_manual(schema_type, properties) return { "jsonld": jsonld, "schema_type": schema_type, "success": True, } except Exception as e: logger.error(f"SchemaGenerateTool 生成失败: {e}") return { "error": str(e), "schema_type": schema_type, "success": False, }