feat(tools): U3 built-in Python tools - WebCrawl, SchemaExtract, SchemaGenerate
Add WebCrawlTool (Crawl4AI wrapper with graceful degradation), SchemaExtractTool (extruct-based Schema.org extraction), and SchemaGenerateTool (JSON-LD generation with optional pydantic-schemaorg validation). All tools work without optional dependencies.
This commit is contained in:
parent
550d29a139
commit
9ec1740047
|
|
@ -6,6 +6,9 @@ from agentkit.tools.agent_tool import AgentTool
|
||||||
from agentkit.tools.mcp_tool import MCPTool
|
from agentkit.tools.mcp_tool import MCPTool
|
||||||
from agentkit.tools.registry import ToolRegistry
|
from agentkit.tools.registry import ToolRegistry
|
||||||
from agentkit.tools.composition import SequentialChain, ParallelFanOut, DynamicSelector
|
from agentkit.tools.composition import SequentialChain, ParallelFanOut, DynamicSelector
|
||||||
|
from agentkit.tools.web_crawl import WebCrawlTool
|
||||||
|
from agentkit.tools.schema_tools import SchemaExtractTool, SchemaGenerateTool
|
||||||
|
from agentkit.tools.baidu_search import BaiduSearchTool
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"Tool",
|
"Tool",
|
||||||
|
|
@ -16,4 +19,8 @@ __all__ = [
|
||||||
"SequentialChain",
|
"SequentialChain",
|
||||||
"ParallelFanOut",
|
"ParallelFanOut",
|
||||||
"DynamicSelector",
|
"DynamicSelector",
|
||||||
|
"WebCrawlTool",
|
||||||
|
"SchemaExtractTool",
|
||||||
|
"SchemaGenerateTool",
|
||||||
|
"BaiduSearchTool",
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,344 @@
|
||||||
|
"""Schema 工具集 - 结构化数据提取与生成
|
||||||
|
|
||||||
|
SchemaExtractTool: 从 HTML 中提取 JSON-LD / Microdata / RDFa 等结构化数据
|
||||||
|
SchemaGenerateTool: 生成 Schema.org JSON-LD 标记
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from agentkit.tools.base import Tool
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# 检测 extruct 是否可用
|
||||||
|
_EXTRUCT_AVAILABLE = False
|
||||||
|
extruct = None
|
||||||
|
try:
|
||||||
|
import extruct
|
||||||
|
|
||||||
|
_EXTRUCT_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# 检测 pydantic_schemaorg 是否可用
|
||||||
|
_PYDANTIC_SCHEMAORG_AVAILABLE = False
|
||||||
|
pydantic_schemaorg = None
|
||||||
|
try:
|
||||||
|
import pydantic_schemaorg
|
||||||
|
|
||||||
|
_PYDANTIC_SCHEMAORG_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class SchemaExtractTool(Tool):
    """Extract structured data (JSON-LD, Microdata, RDFa, Dublin Core) from HTML.

    Extraction is delegated to the optional ``extruct`` package. When that
    package is not installed, :meth:`execute` returns an error payload with an
    install hint instead of raising.
    """

    SUPPORTED_FORMATS = {"json-ld", "microdata", "rdfa", "dublincore"}

    def __init__(
        self,
        name: str = "schema_extract",
        description: str = "从网页 HTML 中提取结构化数据(JSON-LD、Microdata、RDFa 等)",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Create the tool; omitted schemas and tags fall back to the defaults."""
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["schema", "extraction"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the arguments of :meth:`execute`."""
        return {
            "type": "object",
            "properties": {
                "url_or_html": {
                    "type": "string",
                    "description": "要提取的 URL 或原始 HTML 字符串",
                },
                "formats": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "要提取的格式列表",
                    "default": ["json-ld"],
                },
            },
            "required": ["url_or_html"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the payload of :meth:`execute`."""
        return {
            "type": "object",
            "properties": {
                "schemas": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "format": {"type": "string"},
                            "data": {"type": "object"},
                        },
                    },
                    "description": "提取到的结构化数据列表",
                },
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息(仅失败时)"},
            },
        }

    @staticmethod
    def _failure(message: str) -> dict[str, Any]:
        """Build the uniform failure payload used by every error path."""
        return {"error": message, "schemas": [], "success": False}

    def _is_url(self, text: str) -> bool:
        """Return True when ``text`` is an http(s) URL rather than raw HTML.

        The scheme comparison is case-insensitive and ignores surrounding
        whitespace.
        """
        return text.strip().lower().startswith(("http://", "https://"))

    @staticmethod
    def _fetch_html(url: str) -> str:
        """Download ``url`` synchronously and return the body decoded as UTF-8.

        Undecodable bytes are replaced rather than raising. This helper blocks,
        so :meth:`execute` runs it in a worker thread.
        """
        import urllib.request

        request = urllib.request.Request(url, headers={"User-Agent": "AgentKit/1.0"})
        with urllib.request.urlopen(request, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")

    async def execute(self, **kwargs) -> dict:
        """Extract structured data from a URL or a raw HTML string.

        Args:
            url_or_html: URL (http/https) or raw HTML string. Required.
            formats: Formats to extract; defaults to ``["json-ld"]``. A single
                string is accepted as a one-element list. Allowed values are
                the members of ``SUPPORTED_FORMATS``.

        Returns:
            ``{"schemas": [...], "success": True}`` on success, where every
            entry is ``{"format": <format>, "data": <item>}``. On failure,
            ``{"error": <message>, "schemas": [], "success": False}``.
        """
        import asyncio

        url_or_html = kwargs.get("url_or_html")
        if not url_or_html:
            return self._failure("url_or_html 参数是必需的")

        # Normalise ``formats``: an explicit None means "use the default", a bare
        # string is treated as a single format, and duplicates are dropped while
        # keeping the caller's order so results are not repeated.
        requested = kwargs.get("formats")
        if requested is None:
            requested = ["json-ld"]
        elif isinstance(requested, str):
            requested = [requested]
        try:
            formats = list(dict.fromkeys(requested))
        except TypeError:
            # Non-iterable value or unhashable entries.
            return self._failure("formats 必须是字符串列表")

        invalid_formats = set(formats) - self.SUPPORTED_FORMATS
        if invalid_formats:
            return self._failure(
                f"不支持的格式: {invalid_formats},支持的格式: {self.SUPPORTED_FORMATS}"
            )

        # Graceful degradation: extruct is an optional dependency.
        if not _EXTRUCT_AVAILABLE:
            return self._failure("extruct not installed. Run: pip install extruct")

        try:
            html = url_or_html
            url = None

            if self._is_url(url_or_html):
                url = url_or_html.strip()
                try:
                    # urlopen blocks; keep it off the event loop.
                    html = await asyncio.to_thread(self._fetch_html, url)
                except Exception as e:
                    return self._failure(f"获取 URL 内容失败: {e}")

            # extruct names this keyword ``syntaxes``.
            data = extruct.extract(html, base_url=url or "", syntaxes=formats)

            schemas: list[dict[str, Any]] = [
                {"format": fmt, "data": item}
                for fmt in formats
                for item in data.get(fmt) or []
            ]
            return {"schemas": schemas, "success": True}

        except Exception as e:
            logger.exception("SchemaExtractTool 提取失败: %s", e)
            return self._failure(str(e))
|
||||||
|
|
||||||
|
|
||||||
|
class SchemaGenerateTool(Tool):
    """Generate Schema.org JSON-LD markup for a fixed set of common types.

    When the optional ``pydantic_schemaorg`` package is importable the tool
    first tries to build the markup through it; otherwise, or when that attempt
    fails, the JSON-LD object is assembled by hand. The manual path needs no
    third-party dependency.
    """

    SUPPORTED_TYPES = {
        "Organization",
        "WebPage",
        "Article",
        "Product",
        "FAQPage",
        "HowTo",
        "LocalBusiness",
        "Person",
        "BreadcrumbList",
        "SiteNavigationElement",
    }

    def __init__(
        self,
        name: str = "schema_generate",
        description: str = "生成 Schema.org JSON-LD 结构化数据标记",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Create the tool; omitted schemas and tags fall back to the defaults."""
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["schema", "generation"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the arguments of :meth:`execute`."""
        return {
            "type": "object",
            "properties": {
                "schema_type": {
                    "type": "string",
                    "description": "Schema.org 类型名称,如 Organization、FAQPage 等",
                },
                "properties": {
                    "type": "object",
                    "description": "Schema 属性字典",
                },
            },
            "required": ["schema_type", "properties"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the payload of :meth:`execute`."""
        return {
            "type": "object",
            "properties": {
                "jsonld": {"type": "string", "description": "生成的 JSON-LD 字符串"},
                "schema_type": {"type": "string", "description": "Schema 类型"},
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息(仅失败时)"},
            },
        }

    @staticmethod
    def _to_jsonld(schema_type: str, payload: dict[str, Any]) -> str:
        """Serialise ``payload`` as a JSON-LD document of ``schema_type``.

        ``@context`` and ``@type`` are written first; keys in ``payload`` are
        merged afterwards, so a payload that carries its own ``@context`` or
        ``@type`` overrides the generated values.
        """
        jsonld_obj: dict[str, Any] = {
            "@context": "https://schema.org",
            "@type": schema_type,
        }
        jsonld_obj.update(payload)
        return json.dumps(jsonld_obj, ensure_ascii=False, indent=2)

    def _generate_manual(self, schema_type: str, properties: dict[str, Any]) -> str:
        """Build the JSON-LD string by hand, without any external dependency."""
        return self._to_jsonld(schema_type, properties)

    def _generate_with_schemaorg(self, schema_type: str, properties: dict[str, Any]) -> str | None:
        """Build the JSON-LD string through ``pydantic_schemaorg`` when possible.

        Returns:
            The JSON-LD string, or ``None`` when the package is unavailable,
            exposes no attribute named ``schema_type``, the instance cannot be
            dumped to a dict, or any step raises. ``None`` tells the caller to
            fall back to :meth:`_generate_manual`.
        """
        if not _PYDANTIC_SCHEMAORG_AVAILABLE:
            return None

        try:
            # NOTE(review): the lookup is done on the top-level package object.
            # pydantic-schemaorg appears to ship each type in its own submodule
            # (``pydantic_schemaorg.Organization.Organization``) - confirm that
            # this getattr can actually find the class, otherwise the validated
            # path never runs.
            schema_cls = getattr(pydantic_schemaorg, schema_type, None)
            if schema_cls is None:
                return None

            instance = schema_cls(**properties)

            # Support both the pydantic v2 (model_dump) and v1 (dict) APIs.
            # NOTE(review): no by_alias is passed; verify aliased fields such as
            # "@type" come out under the expected key.
            if hasattr(instance, "model_dump"):
                data = instance.model_dump(exclude_none=True)
            elif hasattr(instance, "dict"):
                data = instance.dict(exclude_none=True)
            else:
                return None

            return self._to_jsonld(schema_type, data)
        except Exception as e:
            # Best effort: record why validation was skipped instead of hiding
            # it, then let the caller fall back to manual generation.
            logger.debug(
                "SchemaGenerateTool pydantic-schemaorg 生成失败,降级到手动生成: %s",
                e,
                exc_info=True,
            )
            return None

    async def execute(self, **kwargs) -> dict:
        """Generate JSON-LD for one Schema.org type.

        Args:
            schema_type: Schema.org type name, e.g. ``"Organization"``. Required;
                must be a member of ``SUPPORTED_TYPES``.
            properties: Dict of properties to place on the object. Required.

        Returns:
            ``{"jsonld": <str>, "schema_type": <str>, "success": True}`` on
            success, otherwise ``{"error": <message>, "schema_type": ...,
            "success": False}``.
        """
        schema_type = kwargs.get("schema_type")
        properties = kwargs.get("properties")

        def failure(message: str, reported_type: Any) -> dict[str, Any]:
            return {"error": message, "schema_type": reported_type, "success": False}

        if not schema_type:
            return failure("schema_type 参数是必需的", "")

        if properties is None:
            return failure("properties 参数是必需的", schema_type)

        if not isinstance(properties, dict):
            return failure("properties 必须是字典类型", schema_type)

        # The isinstance check keeps unhashable values (lists, dicts) from
        # raising TypeError in the set membership test.
        if not isinstance(schema_type, str) or schema_type not in self.SUPPORTED_TYPES:
            return failure(
                f"不支持的 schema_type: {schema_type},支持的类型: {sorted(self.SUPPORTED_TYPES)}",
                schema_type,
            )

        try:
            # Prefer the pydantic-schemaorg path; fall back to manual assembly.
            jsonld = self._generate_with_schemaorg(schema_type, properties)
            if jsonld is None:
                jsonld = self._generate_manual(schema_type, properties)

            return {
                "jsonld": jsonld,
                "schema_type": schema_type,
                "success": True,
            }

        except Exception as e:
            logger.exception("SchemaGenerateTool 生成失败: %s", e)
            return failure(str(e), schema_type)
|
||||||
|
|
@ -0,0 +1,159 @@
|
||||||
|
"""WebCrawlTool - 基于 Crawl4AI 的网页抓取工具,支持优雅降级"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from agentkit.tools.base import Tool
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# 检测 Crawl4AI 是否可用
|
||||||
|
_CRAWL4AI_AVAILABLE = False
|
||||||
|
AsyncWebCrawler = None
|
||||||
|
JsonCssExtractionStrategy = None
|
||||||
|
try:
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||||
|
|
||||||
|
_CRAWL4AI_AVAILABLE = True
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class WebCrawlTool(Tool):
    """Fetch a web page through Crawl4AI, degrading gracefully when it is absent.

    The tool can return the page as Markdown or HTML and optionally attach the
    output of a CSS-based extraction strategy. When Crawl4AI is not installed,
    :meth:`execute` returns an error payload with an install hint instead of
    raising.
    """

    def __init__(
        self,
        name: str = "web_crawl",
        description: str = "抓取网页内容,支持 Markdown/HTML 输出和 CSS 选择器提取",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Create the tool; omitted schemas and tags fall back to the defaults."""
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["web", "crawl"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the arguments of :meth:`execute`."""
        return {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "要抓取的 URL",
                },
                "format": {
                    "type": "string",
                    "description": "输出格式:markdown 或 html",
                    "default": "markdown",
                    "enum": ["markdown", "html"],
                },
                "css_selector": {
                    "type": "string",
                    "description": "可选的 CSS 选择器,用于结构化提取",
                },
                "js_wait": {
                    "type": "number",
                    "description": "等待 JS 渲染的秒数",
                    "default": 0,
                },
            },
            "required": ["url"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the payload of :meth:`execute`."""
        return {
            "type": "object",
            "properties": {
                "content": {"type": "string", "description": "抓取到的内容"},
                "status_code": {"type": "integer", "description": "HTTP 状态码"},
                "links": {"type": "array", "items": {"type": "string"}, "description": "页面中的链接"},
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息(仅失败时)"},
            },
        }

    @staticmethod
    def _as_text(value: Any) -> str:
        """Coerce a crawler content field to ``str``.

        Falsy values become ``""`` and strings pass through unchanged. For any
        other object the ``raw_markdown`` attribute is preferred when it is a
        string (presumably what some Crawl4AI versions expose on their markdown
        result object - verify against the installed version); otherwise
        ``str(value)`` is used.
        """
        if not value:
            return ""
        if isinstance(value, str):
            return value
        raw = getattr(value, "raw_markdown", None)
        return raw if isinstance(raw, str) else str(value)

    @staticmethod
    def _collect_links(raw_links: Any) -> list[str]:
        """Flatten the crawler's link data into a list.

        A list is returned unchanged. A dict is treated as groups of entries
        (each entry either a string or a dict with an ``"href"`` key) and
        flattened to the non-empty string hrefs. Anything else yields ``[]``.
        """
        if isinstance(raw_links, list):
            return raw_links
        if not isinstance(raw_links, dict):
            return []

        hrefs: list[str] = []
        for group in raw_links.values():
            if not isinstance(group, (list, tuple)):
                continue
            for entry in group:
                href = entry.get("href") if isinstance(entry, dict) else entry
                if isinstance(href, str) and href:
                    hrefs.append(href)
        return hrefs

    async def execute(self, **kwargs) -> dict:
        """Crawl one page and return its content.

        Args:
            url: URL to crawl. Required.
            format: ``"html"`` returns the page HTML; any other value
                (default ``"markdown"``) returns the Markdown rendering.
            css_selector: Optional CSS selector used for structured extraction.
            js_wait: Seconds to wait for JavaScript rendering (default 0).

        Returns:
            On success a dict with ``content``, ``status_code``, ``links`` and
            ``success=True``, plus ``extracted`` when an extraction strategy
            was used and produced content. On failure
            ``{"error": <message>, "success": False}``.
        """
        url = kwargs.get("url")
        if not url:
            return {"error": "url 参数是必需的", "success": False}

        output_format = kwargs.get("format", "markdown")
        css_selector = kwargs.get("css_selector")
        js_wait = kwargs.get("js_wait", 0)

        # Graceful degradation: Crawl4AI is an optional dependency.
        if not _CRAWL4AI_AVAILABLE:
            return {
                "error": "Crawl4AI not installed. Run: pip install crawl4ai",
                "success": False,
            }

        try:
            extraction_strategy = None
            if css_selector:
                # NOTE(review): Crawl4AI documents JsonCssExtractionStrategy as
                # taking a schema dict (baseSelector/fields), not a bare selector
                # string - confirm this call works with the installed version.
                extraction_strategy = JsonCssExtractionStrategy(css_selector)

            async with AsyncWebCrawler() as crawler:
                # NOTE(review): verify that ``js_wait`` is a keyword the
                # installed Crawl4AI honours; its docs describe
                # ``delay_before_return_html`` for this purpose.
                result = await crawler.arun(
                    url=url,
                    extraction_strategy=extraction_strategy,
                    js_wait=js_wait if js_wait else None,
                )

            # The crawler can report a failed crawl on the result object rather
            # than raising; do not present such a result as a success.
            if getattr(result, "success", True) is False:
                reason = getattr(result, "error_message", None)
                if not (isinstance(reason, str) and reason):
                    reason = f"WebCrawlTool 抓取失败: {url}"
                logger.error("WebCrawlTool 抓取失败: %s - %s", url, reason)
                return {"error": reason, "success": False}

            raw_content = result.html if output_format == "html" else result.markdown

            response: dict[str, Any] = {
                "content": self._as_text(raw_content),
                "status_code": getattr(result, "status_code", 200),
                "links": self._collect_links(getattr(result, "links", None)),
                "success": True,
            }

            # Attach the structured extraction output when a strategy was used.
            extracted = getattr(result, "extracted_content", None)
            if extraction_strategy and extracted:
                response["extracted"] = extracted

            return response

        except Exception as e:
            logger.exception("WebCrawlTool 抓取失败: %s - %s", url, e)
            return {
                "error": str(e),
                "success": False,
            }
|
||||||
|
|
@ -0,0 +1,413 @@
|
||||||
|
"""Schema 工具集单元测试 - SchemaExtractTool + SchemaGenerateTool"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from agentkit.tools.schema_tools import SchemaExtractTool, SchemaGenerateTool
|
||||||
|
|
||||||
|
|
||||||
|
# ========== SchemaExtractTool 测试 ==========
|
||||||
|
|
||||||
|
|
||||||
|
class TestSchemaExtractToolConstruction:
    """Construction and metadata checks for SchemaExtractTool."""

    def test_default_construction(self):
        """Defaults give the expected name and a schema requiring url_or_html."""
        extractor = SchemaExtractTool()
        input_schema = extractor.input_schema

        assert extractor.name == "schema_extract"
        assert input_schema is not None
        assert extractor.output_schema is not None
        assert "url_or_html" in input_schema["properties"]
        assert input_schema["required"] == ["url_or_html"]

    def test_custom_construction(self):
        """A caller-supplied name is kept on the tool."""
        overrides = {
            "name": "my_extractor",
            "description": "自定义提取器",
            "version": "2.0.0",
        }
        extractor = SchemaExtractTool(**overrides)
        assert extractor.name == "my_extractor"

    def test_supported_formats(self):
        """All four documented formats are advertised."""
        advertised = SchemaExtractTool().SUPPORTED_FORMATS
        for expected in ("json-ld", "microdata", "rdfa", "dublincore"):
            assert expected in advertised

    def test_to_dict(self):
        """The dict form carries the tool name."""
        as_dict = SchemaExtractTool().to_dict()
        assert as_dict["name"] == "schema_extract"
|
||||||
|
|
||||||
|
|
||||||
|
class TestSchemaExtractToolGracefulDegradation:
    """Behaviour of SchemaExtractTool when extruct is not available."""

    @pytest.mark.asyncio
    async def test_execute_without_extruct(self):
        """Without extruct the tool fails softly and includes an install hint."""
        extruct_missing = patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", False)
        with extruct_missing:
            outcome = await SchemaExtractTool().execute(url_or_html="<html></html>")

            error_text = outcome["error"]
            assert outcome["success"] is False
            assert "extruct not installed" in error_text
            assert "pip install extruct" in error_text
            assert outcome["schemas"] == []
|
||||||
|
|
||||||
|
|
||||||
|
class TestSchemaExtractToolValidation:
    """Input validation of SchemaExtractTool.execute."""

    @pytest.mark.asyncio
    async def test_execute_missing_url_or_html(self):
        """Omitting url_or_html yields a failure that names the argument."""
        outcome = await SchemaExtractTool().execute()
        assert outcome["success"] is False
        assert "url_or_html" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_empty_url_or_html(self):
        """An empty url_or_html string is rejected."""
        outcome = await SchemaExtractTool().execute(url_or_html="")
        assert outcome["success"] is False

    @pytest.mark.asyncio
    async def test_execute_invalid_format(self):
        """An unknown format name is rejected with an explanatory error."""
        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True):
            extractor = SchemaExtractTool()
            outcome = await extractor.execute(
                url_or_html="<html></html>",
                formats=["invalid-format"],
            )

            message = outcome["error"]
            assert outcome["success"] is False
            assert "不支持" in message or "invalid" in message.lower()
|
||||||
|
|
||||||
|
|
||||||
|
class TestSchemaExtractToolWithMockedExtruct:
    """Extraction logic of SchemaExtractTool exercised against a mocked extruct."""

    SAMPLE_HTML_WITH_JSONLD = """
    <html>
    <head>
        <script type="application/ld+json">
        {
            "@context": "https://schema.org",
            "@type": "Organization",
            "name": "Test Corp"
        }
        </script>
    </head>
    <body></body>
    </html>
    """

    @staticmethod
    def _with_extruct(fake_extruct):
        """Patch the module so it believes ``fake_extruct`` is the real library."""
        return patch.multiple(
            "agentkit.tools.schema_tools",
            _EXTRUCT_AVAILABLE=True,
            extruct=fake_extruct,
        )

    @pytest.mark.asyncio
    async def test_extract_jsonld_from_html(self):
        """A JSON-LD item reported by extruct is returned with its format tag."""
        fake_extruct = MagicMock()
        fake_extruct.extract.return_value = {
            "json-ld": [
                {"@context": "https://schema.org", "@type": "Organization", "name": "Test Corp"}
            ]
        }

        with self._with_extruct(fake_extruct):
            outcome = await SchemaExtractTool().execute(
                url_or_html=self.SAMPLE_HTML_WITH_JSONLD
            )

            assert outcome["success"] is True
            assert len(outcome["schemas"]) == 1
            first = outcome["schemas"][0]
            assert first["format"] == "json-ld"
            assert first["data"]["@type"] == "Organization"
            assert first["data"]["name"] == "Test Corp"

    @pytest.mark.asyncio
    async def test_extract_no_schema_data(self):
        """HTML without structured data succeeds with an empty list."""
        fake_extruct = MagicMock()
        fake_extruct.extract.return_value = {"json-ld": []}

        with self._with_extruct(fake_extruct):
            outcome = await SchemaExtractTool().execute(
                url_or_html="<html><body>No schema</body></html>"
            )

            assert outcome["success"] is True
            assert outcome["schemas"] == []

    @pytest.mark.asyncio
    async def test_extract_multiple_formats(self):
        """Items from every requested format are collected."""
        fake_extruct = MagicMock()
        fake_extruct.extract.return_value = {
            "json-ld": [{"@type": "Organization", "name": "Corp"}],
            "microdata": [{"type": "Product", "name": "Item"}],
        }

        with self._with_extruct(fake_extruct):
            outcome = await SchemaExtractTool().execute(
                url_or_html="<html></html>",
                formats=["json-ld", "microdata"],
            )

            assert outcome["success"] is True
            assert len(outcome["schemas"]) == 2
            formats_found = {entry["format"] for entry in outcome["schemas"]}
            assert "json-ld" in formats_found
            assert "microdata" in formats_found

    @pytest.mark.asyncio
    async def test_extract_error_handling(self):
        """An exception raised by extruct becomes a failure payload."""
        fake_extruct = MagicMock()
        fake_extruct.extract.side_effect = Exception("Parse error")

        with self._with_extruct(fake_extruct):
            outcome = await SchemaExtractTool().execute(url_or_html="<html></html>")

            assert outcome["success"] is False
            assert "Parse error" in outcome["error"]

    @pytest.mark.asyncio
    async def test_extract_with_url(self):
        """A URL input is fetched first and the downloaded HTML is extracted."""
        fake_extruct = MagicMock()
        fake_extruct.extract.return_value = {
            "json-ld": [{"@type": "WebPage"}]
        }

        with self._with_extruct(fake_extruct), \
                patch("urllib.request.urlopen") as mock_urlopen:
            # The response is used as a context manager by the tool.
            fake_response = MagicMock()
            fake_response.read.return_value = b"<html><body>Test</body></html>"
            fake_response.__enter__ = MagicMock(return_value=fake_response)
            fake_response.__exit__ = MagicMock(return_value=None)
            mock_urlopen.return_value = fake_response

            outcome = await SchemaExtractTool().execute(url_or_html="https://example.com")
            assert outcome["success"] is True
|
||||||
|
|
||||||
|
|
||||||
|
# ========== SchemaGenerateTool 测试 ==========
|
||||||
|
|
||||||
|
|
||||||
|
class TestSchemaGenerateToolConstruction:
    """Construction and metadata checks for SchemaGenerateTool."""

    def test_default_construction(self):
        """Defaults give the expected name and both input properties."""
        generator = SchemaGenerateTool()
        declared = generator.input_schema

        assert generator.name == "schema_generate"
        assert declared is not None
        assert generator.output_schema is not None
        assert "schema_type" in declared["properties"]
        assert "properties" in declared["properties"]

    def test_supported_types(self):
        """Every documented Schema.org type is advertised."""
        advertised = SchemaGenerateTool().SUPPORTED_TYPES
        expected_types = (
            "Organization",
            "FAQPage",
            "Article",
            "Product",
            "HowTo",
            "LocalBusiness",
            "Person",
            "BreadcrumbList",
            "SiteNavigationElement",
            "WebPage",
        )
        for type_name in expected_types:
            assert type_name in advertised
|
||||||
|
|
||||||
|
|
||||||
|
class TestSchemaGenerateToolValidation:
    """Input validation of SchemaGenerateTool.execute."""

    @pytest.mark.asyncio
    async def test_execute_missing_schema_type(self):
        """Omitting schema_type yields a failure that names the argument."""
        outcome = await SchemaGenerateTool().execute(properties={"name": "Test"})
        assert outcome["success"] is False
        assert "schema_type" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_missing_properties(self):
        """Omitting properties yields a failure that names the argument."""
        outcome = await SchemaGenerateTool().execute(schema_type="Organization")
        assert outcome["success"] is False
        assert "properties" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_invalid_schema_type(self):
        """A type outside SUPPORTED_TYPES is rejected."""
        outcome = await SchemaGenerateTool().execute(
            schema_type="InvalidType",
            properties={"name": "Test"},
        )
        message = outcome["error"]
        assert outcome["success"] is False
        assert "不支持" in message or "InvalidType" in message

    @pytest.mark.asyncio
    async def test_execute_properties_not_dict(self):
        """A non-dict properties value is rejected."""
        outcome = await SchemaGenerateTool().execute(
            schema_type="Organization",
            properties="not a dict",
        )
        message = outcome["error"]
        assert outcome["success"] is False
        assert "字典" in message or "dict" in message.lower()
|
||||||
|
|
||||||
|
|
||||||
|
class TestSchemaGenerateToolManualGeneration:
|
||||||
|
"""测试手动 JSON-LD 生成(始终可用,无需外部依赖)"""
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_generate_organization(self):
|
||||||
|
"""测试生成 Organization 类型"""
|
||||||
|
with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
|
||||||
|
tool = SchemaGenerateTool()
|
||||||
|
result = await tool.execute(
|
||||||
|
schema_type="Organization",
|
||||||
|
properties={"name": "Fischer AI", "url": "https://fischer.ai"},
|
||||||
|
)
|
||||||
|
assert result["success"] is True
|
||||||
|
assert result["schema_type"] == "Organization"
|
||||||
|
|
||||||
|
jsonld = json.loads(result["jsonld"])
|
||||||
|
assert jsonld["@context"] == "https://schema.org"
|
||||||
|
assert jsonld["@type"] == "Organization"
|
||||||
|
assert jsonld["name"] == "Fischer AI"
|
||||||
|
assert jsonld["url"] == "https://fischer.ai"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_generate_faq_page(self):
|
||||||
|
"""测试生成 FAQPage 类型"""
|
||||||
|
with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
|
||||||
|
tool = SchemaGenerateTool()
|
||||||
|
result = await tool.execute(
|
||||||
|
schema_type="FAQPage",
|
||||||
|
properties={
|
||||||
|
"mainEntity": [
|
||||||
|
{
|
||||||
|
"@type": "Question",
|
||||||
|
"name": "What is GEO?",
|
||||||
|
"acceptedAnswer": {
|
||||||
|
"@type": "Answer",
|
||||||
|
"text": "Generative Engine Optimization",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert result["success"] is True
|
||||||
|
jsonld = json.loads(result["jsonld"])
|
||||||
|
assert jsonld["@type"] == "FAQPage"
|
||||||
|
assert len(jsonld["mainEntity"]) == 1
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_generate_article(self):
|
||||||
|
"""测试生成 Article 类型"""
|
||||||
|
with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
|
||||||
|
tool = SchemaGenerateTool()
|
||||||
|
result = await tool.execute(
|
||||||
|
schema_type="Article",
|
||||||
|
properties={
|
||||||
|
"headline": "Test Article",
|
||||||
|
"author": {"@type": "Person", "name": "John"},
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert result["success"] is True
|
||||||
|
jsonld = json.loads(result["jsonld"])
|
||||||
|
assert jsonld["@type"] == "Article"
|
||||||
|
assert jsonld["headline"] == "Test Article"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_generate_breadcrumb_list(self):
|
||||||
|
"""测试生成 BreadcrumbList 类型"""
|
||||||
|
with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
|
||||||
|
tool = SchemaGenerateTool()
|
||||||
|
result = await tool.execute(
|
||||||
|
schema_type="BreadcrumbList",
|
||||||
|
properties={
|
||||||
|
"itemListElement": [
|
||||||
|
{"@type": "ListItem", "position": 1, "name": "Home"},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert result["success"] is True
|
||||||
|
jsonld = json.loads(result["jsonld"])
|
||||||
|
assert jsonld["@type"] == "BreadcrumbList"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_output_is_valid_jsonld(self):
    """Check that the output is valid JSON-LD (contains @context and @type)."""
    checked_types = ("Organization", "WebPage", "Product", "Person")
    with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
        generator = SchemaGenerateTool()
        # One tool instance is reused for every schema type.
        for type_name in checked_types:
            outcome = await generator.execute(
                schema_type=type_name,
                properties={"name": f"Test {type_name}"},
            )
            assert outcome["success"] is True
            document = json.loads(outcome["jsonld"])
            # Presence is asserted before value so a missing key fails with a
            # clear message instead of a KeyError.
            assert "@context" in document
            assert document["@context"] == "https://schema.org"
            assert "@type" in document
            assert document["@type"] == type_name
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_manual_generation_preserves_chinese(self):
    """Check that manual generation keeps Chinese characters intact."""
    chinese_properties = {"name": "费舍尔科技", "description": "AI 驱动的企业平台"}
    with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
        generator = SchemaGenerateTool()
        outcome = await generator.execute(
            schema_type="Organization",
            properties=chinese_properties,
        )
        assert outcome["success"] is True
        # Round-trip through json.loads: the decoded values must equal the input text.
        document = json.loads(outcome["jsonld"])
        assert document["name"] == "费舍尔科技"
        assert document["description"] == "AI 驱动的企业平台"
|
||||||
|
|
||||||
|
|
||||||
|
class TestSchemaGenerateToolWithPydanticSchemaorg:
    """Behaviour of SchemaGenerateTool around the optional pydantic-schemaorg dependency."""

    @pytest.mark.asyncio
    async def test_fallback_to_manual_when_schemaorg_fails(self):
        """Fall back to manual generation when the pydantic-schemaorg build fails."""
        fake_schemaorg = MagicMock()
        # An explicit None makes attribute lookup of the type yield None,
        # simulating a schema type that does not exist in the library.
        fake_schemaorg.Organization = None

        # create=True lets the patch succeed even if schema_tools has no
        # `pydantic_schemaorg` attribute (presumably the case when the optional
        # package is not installed -- confirm against schema_tools.py). Without
        # it, patch() raises AttributeError before the tool is exercised.
        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", True), \
                patch(
                    "agentkit.tools.schema_tools.pydantic_schemaorg",
                    fake_schemaorg,
                    create=True,
                ):
            tool = SchemaGenerateTool()
            result = await tool.execute(
                schema_type="Organization",
                properties={"name": "Test"},
            )
            # The tool is expected to degrade to manual generation.
            assert result["success"] is True
            jsonld = json.loads(result["jsonld"])
            assert jsonld["@type"] == "Organization"
            assert jsonld["name"] == "Test"

    @pytest.mark.asyncio
    async def test_schemaorg_not_available_uses_manual(self):
        """Use manual generation when pydantic-schemaorg is unavailable."""
        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
            tool = SchemaGenerateTool()
            result = await tool.execute(
                schema_type="Organization",
                properties={"name": "Manual Corp"},
            )
            assert result["success"] is True
            jsonld = json.loads(result["jsonld"])
            assert jsonld["name"] == "Manual Corp"
|
||||||
|
|
||||||
|
|
||||||
|
class TestSchemaGenerateToolSafeExecute:
    """Tests for the safe_execute hook."""

    @pytest.mark.asyncio
    async def test_safe_execute_success(self):
        """safe_execute reports success for a simple Organization request."""
        request = {
            "schema_type": "Organization",
            "properties": {"name": "Test"},
        }
        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
            generator = SchemaGenerateTool()
            outcome = await generator.safe_execute(**request)
            assert outcome["success"] is True
|
||||||
|
|
@ -0,0 +1,201 @@
|
||||||
|
"""WebCrawlTool 单元测试"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import types
|
||||||
|
from unittest.mock import AsyncMock, MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from agentkit.tools.web_crawl import WebCrawlTool
|
||||||
|
|
||||||
|
|
||||||
|
class TestWebCrawlToolConstruction:
    """Tests for constructing and introspecting WebCrawlTool."""

    def test_default_construction(self):
        """A tool built without arguments exposes its default metadata and schemas."""
        crawler_tool = WebCrawlTool()
        assert crawler_tool.name == "web_crawl"
        # The description may be written in Chinese or English; accept either.
        description = crawler_tool.description
        assert "抓取" in description or "crawl" in description.lower()
        assert crawler_tool.input_schema is not None
        assert crawler_tool.output_schema is not None
        input_schema = crawler_tool.input_schema
        assert "url" in input_schema["properties"]
        assert input_schema["required"] == ["url"]

    def test_custom_construction(self):
        """Constructor overrides are reflected on the instance."""
        overrides = {
            "name": "my_crawler",
            "description": "自定义爬虫",
            "version": "2.0.0",
            "tags": ["custom"],
        }
        crawler_tool = WebCrawlTool(**overrides)
        assert crawler_tool.name == "my_crawler"
        assert crawler_tool.description == "自定义爬虫"
        assert crawler_tool.version == "2.0.0"
        assert crawler_tool.tags == ["custom"]

    def test_to_dict(self):
        """to_dict() carries the name and both schemas."""
        payload = WebCrawlTool().to_dict()
        assert payload["name"] == "web_crawl"
        assert "input_schema" in payload
        assert "output_schema" in payload

    def test_repr(self):
        """repr() mentions both the class name and the tool name."""
        text = repr(WebCrawlTool())
        assert "WebCrawlTool" in text
        assert "web_crawl" in text
|
||||||
|
|
||||||
|
|
||||||
|
class TestWebCrawlToolGracefulDegradation:
    """Tests for graceful degradation when Crawl4AI is unavailable."""

    @pytest.mark.asyncio
    async def test_execute_without_crawl4ai(self):
        """Return an installation hint when Crawl4AI is not installed."""
        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", False):
            crawler_tool = WebCrawlTool()
            outcome = await crawler_tool.execute(url="https://example.com")
            assert outcome["success"] is False
            message = outcome["error"]
            assert "Crawl4AI not installed" in message
            assert "pip install crawl4ai" in message

    @pytest.mark.asyncio
    async def test_safe_execute_without_crawl4ai(self):
        """safe_execute should also return normally when Crawl4AI is unavailable."""
        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", False):
            crawler_tool = WebCrawlTool()
            outcome = await crawler_tool.safe_execute(url="https://example.com")
            assert outcome["success"] is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestWebCrawlToolValidation:
    """Tests for input validation."""

    @pytest.mark.asyncio
    async def test_execute_missing_url(self):
        """Calling execute() with no url fails and the error mentions 'url'."""
        outcome = await WebCrawlTool().execute()
        assert outcome["success"] is False
        assert "url" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_empty_url(self):
        """Calling execute() with an empty url string fails."""
        outcome = await WebCrawlTool().execute(url="")
        assert outcome["success"] is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestWebCrawlToolWithMockedCrawl4AI:
    """Exercise the normal crawl logic against a mocked Crawl4AI."""

    # Patch targets inside the module under test.
    _FLAG = "agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE"
    _CRAWLER = "agentkit.tools.web_crawl.AsyncWebCrawler"
    _STRATEGY = "agentkit.tools.web_crawl.JsonCssExtractionStrategy"

    # NOTE(review): every patch of _CRAWLER / _STRATEGY below passes create=True
    # so it succeeds even if web_crawl did not bind the name (presumably the
    # case when the optional crawl4ai package is not installed -- confirm
    # against web_crawl.py). When the attribute exists, create=True is a no-op.

    def _make_mock_crawler(self, markdown="# Hello", html="<h1>Hello</h1>", links=None, status_code=200):
        """Create a mock AsyncWebCrawler.

        Args:
            markdown: Value exposed as ``result.markdown``.
            html: Value exposed as ``result.html``.
            links: Value exposed as ``result.links``; ``None`` selects a
                one-element default list. An explicit empty list is kept as is.
            status_code: Value exposed as ``result.status_code``.

        Returns:
            A ``(mock_crawler, mock_result)`` tuple. ``mock_crawler.arun``
            resolves to ``mock_result`` and the crawler returns itself from
            ``__aenter__`` so it can be used in ``async with``.
        """
        mock_result = MagicMock()
        mock_result.markdown = markdown
        mock_result.html = html
        # Test for None explicitly: `links or default` would also replace a
        # deliberately empty list with the default.
        mock_result.links = ["https://example.com/page1"] if links is None else links
        mock_result.status_code = status_code
        mock_result.extracted_content = None

        mock_crawler = AsyncMock()
        mock_crawler.arun = AsyncMock(return_value=mock_result)
        mock_crawler.__aenter__ = AsyncMock(return_value=mock_crawler)
        mock_crawler.__aexit__ = AsyncMock(return_value=None)

        return mock_crawler, mock_result

    @pytest.mark.asyncio
    async def test_execute_markdown_format(self):
        """Markdown output format."""
        mock_crawler, _ = self._make_mock_crawler(markdown="# Test Page")

        with patch(self._FLAG, True), patch(self._CRAWLER, return_value=mock_crawler, create=True):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", format="markdown")
            assert result["success"] is True
            assert result["content"] == "# Test Page"
            assert result["status_code"] == 200

    @pytest.mark.asyncio
    async def test_execute_html_format(self):
        """HTML output format."""
        mock_crawler, _ = self._make_mock_crawler(html="<h1>Test</h1>")

        with patch(self._FLAG, True), patch(self._CRAWLER, return_value=mock_crawler, create=True):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", format="html")
            assert result["success"] is True
            assert result["content"] == "<h1>Test</h1>"

    @pytest.mark.asyncio
    async def test_execute_with_links(self):
        """Link extraction."""
        mock_crawler, _ = self._make_mock_crawler(links=["https://example.com/a", "https://example.com/b"])

        with patch(self._FLAG, True), patch(self._CRAWLER, return_value=mock_crawler, create=True):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")
            assert result["success"] is True
            assert len(result["links"]) == 2

    @pytest.mark.asyncio
    async def test_execute_with_css_selector(self):
        """CSS-selector extraction."""
        mock_crawler, mock_result = self._make_mock_crawler()
        mock_result.extracted_content = '{"title": "Test"}'

        mock_strategy_cls = MagicMock(return_value=MagicMock())

        with patch(self._FLAG, True), \
                patch(self._CRAWLER, return_value=mock_crawler, create=True), \
                patch(self._STRATEGY, mock_strategy_cls, create=True):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", css_selector="h1")
            assert result["success"] is True
            assert "extracted" in result
            mock_strategy_cls.assert_called_once_with("h1")

    @pytest.mark.asyncio
    async def test_execute_with_js_wait(self):
        """The js_wait parameter is forwarded to the crawler."""
        mock_crawler, _ = self._make_mock_crawler()

        with patch(self._FLAG, True), patch(self._CRAWLER, return_value=mock_crawler, create=True):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", js_wait=2)
            assert result["success"] is True
            # Verify arun received a js_wait keyword. This is the effective
            # check of the former `== 2 or is not None` expression, whose first
            # clause could never decide the outcome. The forwarded value is
            # deliberately not pinned here.
            arun_kwargs = mock_crawler.arun.call_args.kwargs
            assert arun_kwargs.get("js_wait") is not None

    @pytest.mark.asyncio
    async def test_execute_crawl_error(self):
        """An exception raised while crawling is reported as a failure."""
        mock_crawler = AsyncMock()
        mock_crawler.arun = AsyncMock(side_effect=Exception("Connection timeout"))
        mock_crawler.__aenter__ = AsyncMock(return_value=mock_crawler)
        mock_crawler.__aexit__ = AsyncMock(return_value=None)

        with patch(self._FLAG, True), patch(self._CRAWLER, return_value=mock_crawler, create=True):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")
            assert result["success"] is False
            assert "Connection timeout" in result["error"]

    @pytest.mark.asyncio
    async def test_execute_default_format_is_markdown(self):
        """The default output format is markdown."""
        mock_crawler, _ = self._make_mock_crawler(markdown="MD content", html="HTML content")

        with patch(self._FLAG, True), patch(self._CRAWLER, return_value=mock_crawler, create=True):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")
            assert result["success"] is True
            assert result["content"] == "MD content"
|
||||||
Loading…
Reference in New Issue