345 lines
11 KiB
Python
345 lines
11 KiB
Python
"""Schema 工具集 - 结构化数据提取与生成
|
||
|
||
SchemaExtractTool: 从 HTML 中提取 JSON-LD / Microdata / RDFa 等结构化数据
|
||
SchemaGenerateTool: 生成 Schema.org JSON-LD 标记
|
||
"""
|
||
|
||
import json
|
||
import logging
|
||
from typing import Any
|
||
|
||
from agentkit.tools.base import Tool
|
||
|
||
logger = logging.getLogger(__name__)

# Optional dependency probe for extruct.
# After this block `extruct` is either the imported module or None, and
# `_EXTRUCT_AVAILABLE` is True only when the import succeeded.
try:
    import extruct
except ImportError:
    extruct = None
    _EXTRUCT_AVAILABLE = False
    # Missing optional dependency is expected; record it instead of
    # swallowing the ImportError without a trace.
    logger.debug("extruct is not installed; structured data extraction is disabled")
else:
    _EXTRUCT_AVAILABLE = True
|
||
|
||
# Optional dependency probe for pydantic_schemaorg.
# After this block `pydantic_schemaorg` is either the imported package or
# None, and `_PYDANTIC_SCHEMAORG_AVAILABLE` is True only when the import
# succeeded.
try:
    import pydantic_schemaorg
except ImportError:
    pydantic_schemaorg = None
    _PYDANTIC_SCHEMAORG_AVAILABLE = False
    # Missing optional dependency is expected; record it instead of
    # swallowing the ImportError without a trace.
    logging.getLogger(__name__).debug(
        "pydantic_schemaorg is not installed; validated JSON-LD generation is disabled"
    )
else:
    _PYDANTIC_SCHEMAORG_AVAILABLE = True
|
||
|
||
|
||
class SchemaExtractTool(Tool):
    """Extract structured data (JSON-LD, Microdata, RDFa, Dublin Core) from HTML.

    Extraction is delegated to the optional ``extruct`` library. When
    ``extruct`` is not installed, :meth:`execute` returns an error payload
    instead of raising.
    """

    SUPPORTED_FORMATS = {"json-ld", "microdata", "rdfa", "dublincore"}

    def __init__(
        self,
        name: str = "schema_extract",
        description: str = "从网页 HTML 中提取结构化数据(JSON-LD、Microdata、RDFa 等)",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Initialise the tool, filling in default schemas and tags when omitted."""
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["schema", "extraction"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the arguments of :meth:`execute`."""
        return {
            "type": "object",
            "properties": {
                "url_or_html": {
                    "type": "string",
                    "description": "要提取的 URL 或原始 HTML 字符串",
                },
                "formats": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "要提取的格式列表",
                    "default": ["json-ld"],
                },
            },
            "required": ["url_or_html"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the result of :meth:`execute`."""
        return {
            "type": "object",
            "properties": {
                "schemas": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "format": {"type": "string"},
                            "data": {"type": "object"},
                        },
                    },
                    "description": "提取到的结构化数据列表",
                },
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息(仅失败时)"},
            },
        }

    @staticmethod
    def _error_result(message: str) -> dict[str, Any]:
        """Build the failure payload shared by every error path of :meth:`execute`."""
        return {"error": message, "schemas": [], "success": False}

    @staticmethod
    def _normalize_formats(formats: Any) -> list[str] | None:
        """Coerce the ``formats`` argument into a de-duplicated list of strings.

        ``None`` selects the default ``["json-ld"]``; a bare string is treated
        as a single-element list. Returns ``None`` when the value is not a
        collection of strings.
        """
        if formats is None:
            return ["json-ld"]
        if isinstance(formats, str):
            # Guard against set("json-ld") splitting the name into characters.
            formats = [formats]
        if not isinstance(formats, (list, tuple, set, frozenset)):
            return None
        if not all(isinstance(fmt, str) for fmt in formats):
            return None
        # dict.fromkeys drops duplicates while keeping the caller's order, so
        # the same item is not reported twice.
        return list(dict.fromkeys(formats))

    @staticmethod
    def _fetch_html(url: str) -> str:
        """Download ``url`` and return its body as text (blocking).

        The charset announced in the Content-Type header is honoured; UTF-8
        is used when it is absent or unknown. Undecodable bytes are replaced.
        Any ``urllib`` error propagates to the caller.
        """
        import urllib.request

        req = urllib.request.Request(url, headers={"User-Agent": "AgentKit/1.0"})
        with urllib.request.urlopen(req, timeout=30) as resp:
            charset = resp.headers.get_content_charset() or "utf-8"
            body = resp.read()
        try:
            return body.decode(charset, errors="replace")
        except LookupError:
            # The server announced a codec Python does not know.
            return body.decode("utf-8", errors="replace")

    def _is_url(self, text: str) -> bool:
        """Return True when ``text`` (ignoring surrounding whitespace) starts with http(s)://."""
        return text.strip().startswith(("http://", "https://"))

    async def execute(self, **kwargs) -> dict:
        """Extract structured data from a URL or a raw HTML string.

        Args:
            url_or_html: URL (``http://`` / ``https://``) or raw HTML string (required).
            formats: Formats to extract; defaults to ``["json-ld"]``.
                Allowed values: "json-ld", "microdata", "rdfa", "dublincore".

        Returns:
            On success ``{"schemas": [...], "success": True}`` where every
            entry is ``{"format": <name>, "data": <item>}``. On failure
            ``{"error": <message>, "schemas": [], "success": False}``.
            Failures are reported in the payload rather than raised.
        """
        url_or_html = kwargs.get("url_or_html")
        if not url_or_html:
            return self._error_result("url_or_html 参数是必需的")
        if not isinstance(url_or_html, str):
            return self._error_result("url_or_html 必须是字符串类型")

        formats = self._normalize_formats(kwargs.get("formats"))
        if formats is None:
            return self._error_result("formats 必须是字符串列表")

        invalid_formats = set(formats) - self.SUPPORTED_FORMATS
        if invalid_formats:
            # sorted() keeps the message deterministic across runs.
            return self._error_result(
                f"不支持的格式: {sorted(invalid_formats)},支持的格式: {sorted(self.SUPPORTED_FORMATS)}"
            )

        # Graceful degradation: extruct is an optional dependency.
        if not _EXTRUCT_AVAILABLE:
            return self._error_result("extruct not installed. Run: pip install extruct")

        html = url_or_html
        url = None
        if self._is_url(url_or_html):
            import asyncio

            # _is_url ignores surrounding whitespace, so fetch the stripped form.
            url = url_or_html.strip()
            try:
                # urlopen blocks; run it in a worker thread so the event loop
                # stays responsive while the page downloads.
                html = await asyncio.to_thread(self._fetch_html, url)
            except Exception as e:
                return self._error_result(f"获取 URL 内容失败: {e}")

        try:
            # extruct selects parsers through the `syntaxes` keyword.
            data = extruct.extract(html, base_url=url or "", syntaxes=formats)

            schemas: list[dict[str, Any]] = []
            for fmt in formats:
                for item in data.get(fmt) or []:
                    schemas.append({"format": fmt, "data": item})
            return {"schemas": schemas, "success": True}
        except Exception as e:
            # Tool boundary: report the failure in the payload, keep the traceback in the log.
            logger.exception("SchemaExtractTool 提取失败: %s", e)
            return self._error_result(str(e))
|
||
|
||
|
||
class SchemaGenerateTool(Tool):
    """Generate Schema.org JSON-LD markup for a fixed set of common types.

    When ``pydantic_schemaorg`` is importable the properties are first run
    through the matching model; if that is unavailable or fails, the JSON-LD
    is assembled by hand, which needs no external dependency.
    """

    SUPPORTED_TYPES = {
        "Organization",
        "WebPage",
        "Article",
        "Product",
        "FAQPage",
        "HowTo",
        "LocalBusiness",
        "Person",
        "BreadcrumbList",
        "SiteNavigationElement",
    }

    def __init__(
        self,
        name: str = "schema_generate",
        description: str = "生成 Schema.org JSON-LD 结构化数据标记",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Initialise the tool, filling in default schemas and tags when omitted."""
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["schema", "generation"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the arguments of :meth:`execute`."""
        return {
            "type": "object",
            "properties": {
                "schema_type": {
                    "type": "string",
                    "description": "Schema.org 类型名称,如 Organization、FAQPage 等",
                },
                "properties": {
                    "type": "object",
                    "description": "Schema 属性字典",
                },
            },
            "required": ["schema_type", "properties"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the result of :meth:`execute`."""
        return {
            "type": "object",
            "properties": {
                "jsonld": {"type": "string", "description": "生成的 JSON-LD 字符串"},
                "schema_type": {"type": "string", "description": "Schema 类型"},
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息(仅失败时)"},
            },
        }

    @staticmethod
    def _error_result(schema_type: str, message: str) -> dict[str, Any]:
        """Build the failure payload shared by every error path of :meth:`execute`."""
        return {"error": message, "schema_type": schema_type, "success": False}

    def _generate_manual(self, schema_type: str, properties: dict[str, Any]) -> str:
        """Assemble the JSON-LD document by hand (no external dependency).

        ``properties`` is merged after ``@context`` / ``@type``, so a caller
        supplying either of those keys overrides the defaults.

        Raises:
            TypeError: if ``properties`` holds values ``json.dumps`` cannot serialise.
        """
        jsonld_obj: dict[str, Any] = {
            "@context": "https://schema.org",
            "@type": schema_type,
        }
        jsonld_obj.update(properties)
        return json.dumps(jsonld_obj, ensure_ascii=False, indent=2)

    @staticmethod
    def _resolve_schemaorg_class(schema_type: str) -> type | None:
        """Locate the pydantic_schemaorg model class named ``schema_type``.

        pydantic-schemaorg ships each type in a same-named submodule
        (``pydantic_schemaorg.Organization.Organization``), so a plain
        ``getattr`` on the package yields nothing, or the submodule itself if
        it was already imported. Returns ``None`` when no class is found.
        """
        import importlib
        import inspect

        candidate = getattr(pydantic_schemaorg, schema_type, None)
        if inspect.isclass(candidate):
            return candidate

        # Only plain identifiers may be spliced into a module path.
        if not schema_type.isidentifier():
            return None
        try:
            module = importlib.import_module(f"pydantic_schemaorg.{schema_type}")
        except ImportError:
            return None
        candidate = getattr(module, schema_type, None)
        return candidate if inspect.isclass(candidate) else None

    def _generate_with_schemaorg(self, schema_type: str, properties: dict[str, Any]) -> str | None:
        """Generate JSON-LD through pydantic-schemaorg (with validation).

        Returns:
            The JSON-LD string, or ``None`` when the library is unavailable,
            has no model for ``schema_type``, or validation/serialisation
            fails. ``None`` tells the caller to fall back to manual generation.
        """
        if not _PYDANTIC_SCHEMAORG_AVAILABLE:
            return None

        try:
            schema_cls = self._resolve_schemaorg_class(schema_type)
            if schema_cls is None:
                return None

            instance = schema_cls(**properties)
            # by_alias=True emits aliased field names (e.g. "@type") instead
            # of the Python attribute names, which are not valid JSON-LD keys.
            if hasattr(instance, "model_dump"):
                data = instance.model_dump(exclude_none=True, by_alias=True)
            elif hasattr(instance, "dict"):
                data = instance.dict(exclude_none=True, by_alias=True)
            else:
                return None

            jsonld_obj: dict[str, Any] = {
                "@context": "https://schema.org",
                "@type": schema_type,
            }
            jsonld_obj.update(data)
            return json.dumps(jsonld_obj, ensure_ascii=False, indent=2)
        except Exception as e:
            # Best-effort path: any failure falls back to manual generation,
            # but leave a trace instead of swallowing it silently.
            logger.debug("pydantic-schemaorg generation failed for %s: %s", schema_type, e)
            return None

    async def execute(self, **kwargs) -> dict:
        """Generate a JSON-LD document for one Schema.org type.

        Args:
            schema_type: Schema.org type name (required), one of ``SUPPORTED_TYPES``.
            properties: Dictionary of schema properties (required).

        Returns:
            On success ``{"jsonld": <str>, "schema_type": <str>, "success": True}``.
            On failure ``{"error": <message>, "schema_type": <str>, "success": False}``.
            Failures are reported in the payload rather than raised.
        """
        schema_type = kwargs.get("schema_type")
        properties = kwargs.get("properties")

        if not schema_type:
            return self._error_result("", "schema_type 参数是必需的")

        if not isinstance(schema_type, str):
            # An unhashable value (list/dict) would otherwise raise on the
            # set membership test below.
            return self._error_result(str(schema_type), "schema_type 必须是字符串类型")

        if properties is None:
            return self._error_result(schema_type, "properties 参数是必需的")

        if not isinstance(properties, dict):
            return self._error_result(schema_type, "properties 必须是字典类型")

        if schema_type not in self.SUPPORTED_TYPES:
            return self._error_result(
                schema_type,
                f"不支持的 schema_type: {schema_type},支持的类型: {sorted(self.SUPPORTED_TYPES)}",
            )

        try:
            # Prefer the validated path; None means "fall back to manual".
            jsonld = self._generate_with_schemaorg(schema_type, properties)
            if jsonld is None:
                jsonld = self._generate_manual(schema_type, properties)

            return {
                "jsonld": jsonld,
                "schema_type": schema_type,
                "success": True,
            }
        except Exception as e:
            # Tool boundary: report the failure in the payload, keep the traceback in the log.
            logger.exception("SchemaGenerateTool 生成失败: %s", e)
            return self._error_result(schema_type, str(e))
|