fischer-agentkit/src/agentkit/tools/schema_tools.py

345 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Schema 工具集 - 结构化数据提取与生成
SchemaExtractTool: 从 HTML 中提取 JSON-LD / Microdata / RDFa 等结构化数据
SchemaGenerateTool: 生成 Schema.org JSON-LD 标记
"""
import json
import logging
from typing import Any
from agentkit.tools.base import Tool
logger = logging.getLogger(__name__)
# Optional dependency: extruct (structured-data extraction).
# The module name is bound to None when the package is missing so that
# later references to it never raise NameError.
try:
    import extruct
except ImportError:
    extruct = None
_EXTRUCT_AVAILABLE = extruct is not None

# Optional dependency: pydantic_schemaorg (Schema.org models).
# Same convention as above: None stands in for the missing module.
try:
    import pydantic_schemaorg
except ImportError:
    pydantic_schemaorg = None
_PYDANTIC_SCHEMAORG_AVAILABLE = pydantic_schemaorg is not None
class SchemaExtractTool(Tool):
    """Extract structured data (JSON-LD, Microdata, RDFa, Dublin Core) from HTML.

    Extraction is delegated to the optional ``extruct`` library. When that
    library is not installed, ``execute`` returns a failure result instead
    of raising.
    """

    # Format names accepted in the ``formats`` argument of ``execute``.
    SUPPORTED_FORMATS = {"json-ld", "microdata", "rdfa", "dublincore"}

    def __init__(
        self,
        name: str = "schema_extract",
        description: str = "从网页 HTML 中提取结构化数据JSON-LD、Microdata、RDFa 等)",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Initialise the tool, falling back to the built-in schemas and tags.

        Falsy ``input_schema`` / ``output_schema`` / ``tags`` values are
        replaced by the defaults defined on this class.
        """
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["schema", "extraction"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the arguments of ``execute``."""
        return {
            "type": "object",
            "properties": {
                "url_or_html": {
                    "type": "string",
                    "description": "要提取的 URL 或原始 HTML 字符串",
                },
                "formats": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "要提取的格式列表",
                    "default": ["json-ld"],
                },
            },
            "required": ["url_or_html"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the result of ``execute``."""
        return {
            "type": "object",
            "properties": {
                "schemas": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "format": {"type": "string"},
                            "data": {"type": "object"},
                        },
                    },
                    "description": "提取到的结构化数据列表",
                },
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息(仅失败时)"},
            },
        }

    def _is_url(self, text: str) -> bool:
        """Return True when ``text`` (ignoring surrounding whitespace) is an http(s) URL."""
        return text.strip().startswith(("http://", "https://"))

    @staticmethod
    def _failure(message: str) -> dict[str, Any]:
        """Build the failure result shared by every error path of ``execute``."""
        return {"error": message, "schemas": [], "success": False}

    @staticmethod
    def _fetch_html(url: str) -> str:
        """Download ``url`` synchronously and return the body decoded as UTF-8.

        Undecodable bytes are replaced rather than raising. Any network or
        HTTP error propagates to the caller.
        """
        import urllib.request

        req = urllib.request.Request(url, headers={"User-Agent": "AgentKit/1.0"})
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")

    def _normalize_formats(self, formats: Any) -> list[str]:
        """Validate ``formats`` and return it as a de-duplicated list.

        ``None`` selects the default ``["json-ld"]``; a bare string is treated
        as a single-element list.

        Raises:
            ValueError: if ``formats`` is not a string or a collection of
                strings, or if it names a format outside ``SUPPORTED_FORMATS``.
        """
        if formats is None:
            return ["json-ld"]
        if isinstance(formats, str):
            # Without this, set("json-ld") would split the name into characters.
            formats = [formats]
        elif not isinstance(formats, (list, tuple, set, frozenset)):
            raise ValueError("formats 必须是字符串列表")
        if not all(isinstance(fmt, str) for fmt in formats):
            raise ValueError("formats 必须是字符串列表")

        # Drop duplicates (keeping first-seen order) so results are not repeated.
        unique = list(dict.fromkeys(formats))
        invalid = sorted(set(unique) - self.SUPPORTED_FORMATS)
        if invalid:
            # Sorted lists keep the message deterministic.
            raise ValueError(
                f"不支持的格式: {invalid},支持的格式: {sorted(self.SUPPORTED_FORMATS)}"
            )
        return unique

    async def execute(self, **kwargs) -> dict:
        """Extract structured data from a URL or a raw HTML string.

        Args:
            url_or_html: URL (http/https) or raw HTML string (required).
            formats: Formats to extract; defaults to ``["json-ld"]``.
                Allowed values: "json-ld", "microdata", "rdfa", "dublincore".

        Returns:
            On success ``{"schemas": [...], "success": True}`` where each
            entry is ``{"format": <name>, "data": <item>}``. On failure
            ``{"error": <message>, "schemas": [], "success": False}``.
            Input validation, fetch and extraction errors are all reported
            through the failure result rather than raised.
        """
        import asyncio  # local import keeps this block self-contained

        url_or_html = kwargs.get("url_or_html")
        if not url_or_html:
            return self._failure("url_or_html 参数是必需的")
        if not isinstance(url_or_html, str):
            return self._failure("url_or_html 必须是字符串类型")

        try:
            formats = self._normalize_formats(kwargs.get("formats"))
        except ValueError as exc:
            return self._failure(str(exc))

        # Graceful degradation when extruct is not installed.
        if not _EXTRUCT_AVAILABLE:
            return self._failure("extruct not installed. Run: pip install extruct")

        try:
            html = url_or_html
            base_url = ""
            if self._is_url(url_or_html):
                base_url = url_or_html.strip()
                try:
                    # urlopen blocks; run it in a worker thread so the event
                    # loop stays responsive during the download.
                    html = await asyncio.to_thread(self._fetch_html, base_url)
                except Exception as e:
                    return self._failure(f"获取 URL 内容失败: {e}")

            # extruct names this parameter ``syntaxes``; it rejects unknown
            # keyword arguments.
            data = extruct.extract(html, base_url=base_url, syntaxes=formats)

            schemas: list[dict[str, Any]] = [
                {"format": fmt, "data": item}
                for fmt in formats
                for item in (data.get(fmt) or [])
            ]
            return {"schemas": schemas, "success": True}
        except Exception as e:
            logger.exception("SchemaExtractTool 提取失败: %s", e)
            return self._failure(str(e))
class SchemaGenerateTool(Tool):
    """Generate Schema.org JSON-LD markup for a fixed set of common types.

    When ``pydantic_schemaorg`` is importable, generation is first attempted
    through its model classes; otherwise, or when that attempt yields nothing,
    the JSON-LD object is assembled by hand. The manual path needs no
    external dependency.
    """

    # Schema.org type names accepted by ``execute``.
    SUPPORTED_TYPES = {
        "Organization",
        "WebPage",
        "Article",
        "Product",
        "FAQPage",
        "HowTo",
        "LocalBusiness",
        "Person",
        "BreadcrumbList",
        "SiteNavigationElement",
    }

    def __init__(
        self,
        name: str = "schema_generate",
        description: str = "生成 Schema.org JSON-LD 结构化数据标记",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Initialise the tool, falling back to the built-in schemas and tags.

        Falsy ``input_schema`` / ``output_schema`` / ``tags`` values are
        replaced by the defaults defined on this class.
        """
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["schema", "generation"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the arguments of ``execute``."""
        return {
            "type": "object",
            "properties": {
                "schema_type": {
                    "type": "string",
                    "description": "Schema.org 类型名称,如 Organization、FAQPage 等",
                },
                "properties": {
                    "type": "object",
                    "description": "Schema 属性字典",
                },
            },
            "required": ["schema_type", "properties"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the result of ``execute``."""
        return {
            "type": "object",
            "properties": {
                "jsonld": {"type": "string", "description": "生成的 JSON-LD 字符串"},
                "schema_type": {"type": "string", "description": "Schema 类型"},
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息(仅失败时)"},
            },
        }

    def _generate_manual(self, schema_type: str, properties: dict[str, Any]) -> str:
        """Assemble the JSON-LD document by hand, without external dependencies.

        ``properties`` is merged after the ``@context`` / ``@type`` defaults,
        so a caller-supplied key with the same name replaces the default.

        Raises:
            TypeError: from ``json.dumps`` when a value is not JSON-serialisable.
        """
        jsonld_obj: dict[str, Any] = {
            "@context": "https://schema.org",
            "@type": schema_type,
        }
        jsonld_obj.update(properties)
        return json.dumps(jsonld_obj, ensure_ascii=False, indent=2)

    def _generate_with_schemaorg(self, schema_type: str, properties: dict[str, Any]) -> str | None:
        """Try to build the JSON-LD through a ``pydantic_schemaorg`` model.

        Returns:
            The JSON-LD string, or ``None`` when the library is unavailable,
            no usable class is found for ``schema_type``, the instance cannot
            be converted to a dict, or any step raises. ``None`` tells the
            caller to fall back to manual generation.
        """
        if not _PYDANTIC_SCHEMAORG_AVAILABLE:
            return None
        try:
            # NOTE(review): pydantic_schemaorg appears to keep each type in its
            # own submodule (pydantic_schemaorg.<Type>.<Type>); if so, this
            # package-level lookup only finds a class when it is re-exported.
            # Confirm against the installed version.
            schema_cls = getattr(pydantic_schemaorg, schema_type, None)
            if schema_cls is None or not callable(schema_cls):
                # A non-callable result (e.g. a submodule) cannot be instantiated.
                return None
            instance = schema_cls(**properties)

            # Support both the pydantic v2 (model_dump) and v1 (dict) APIs.
            if hasattr(instance, "model_dump"):
                data = instance.model_dump(exclude_none=True)
            elif hasattr(instance, "dict"):
                data = instance.dict(exclude_none=True)
            else:
                return None

            jsonld_obj: dict[str, Any] = {
                "@context": "https://schema.org",
                "@type": schema_type,
            }
            jsonld_obj.update(data)
            return json.dumps(jsonld_obj, ensure_ascii=False, indent=2)
        except Exception as exc:
            # Best-effort path: record why it failed instead of hiding it,
            # then let the caller fall back to manual generation.
            logger.warning(
                "pydantic_schemaorg generation failed for %s, falling back to manual JSON-LD: %s",
                schema_type,
                exc,
            )
            return None

    async def execute(self, **kwargs) -> dict:
        """Generate a JSON-LD document for a supported Schema.org type.

        Args:
            schema_type: Schema.org type name (required, e.g. "Organization");
                must be one of ``SUPPORTED_TYPES``.
            properties: Dictionary of schema properties (required).

        Returns:
            On success ``{"jsonld": <str>, "schema_type": <str>, "success": True}``.
            On failure ``{"error": <message>, "schema_type": <str>, "success": False}``.
            Validation and generation errors are reported through the failure
            result rather than raised.
        """
        schema_type = kwargs.get("schema_type")
        properties = kwargs.get("properties")

        if not schema_type:
            return {"error": "schema_type 参数是必需的", "schema_type": "", "success": False}
        if not isinstance(schema_type, str):
            # An unhashable value (e.g. a list) would otherwise raise in the
            # set-membership test below.
            return {"error": "schema_type 必须是字符串类型", "schema_type": "", "success": False}
        if properties is None:
            return {"error": "properties 参数是必需的", "schema_type": schema_type, "success": False}
        if not isinstance(properties, dict):
            return {
                "error": "properties 必须是字典类型",
                "schema_type": schema_type,
                "success": False,
            }

        if schema_type not in self.SUPPORTED_TYPES:
            return {
                "error": f"不支持的 schema_type: {schema_type},支持的类型: {sorted(self.SUPPORTED_TYPES)}",
                "schema_type": schema_type,
                "success": False,
            }

        try:
            # Prefer the pydantic_schemaorg path; fall back to manual assembly.
            jsonld = self._generate_with_schemaorg(schema_type, properties)
            if jsonld is None:
                jsonld = self._generate_manual(schema_type, properties)
            return {
                "jsonld": jsonld,
                "schema_type": schema_type,
                "success": True,
            }
        except Exception as e:
            logger.exception("SchemaGenerateTool 生成失败: %s", e)
            return {
                "error": str(e),
                "schema_type": schema_type,
                "success": False,
            }