feat(tools): U3 built-in Python tools - WebCrawl, SchemaExtract, SchemaGenerate

Add WebCrawlTool (Crawl4AI wrapper with graceful degradation),
SchemaExtractTool (extruct-based Schema.org extraction), and
SchemaGenerateTool (JSON-LD generation with optional pydantic-schemaorg
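validation). All tools import without their optional dependencies: WebCrawlTool and SchemaExtractTool return an install hint, SchemaGenerateTool falls back to manual generation.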
This commit is contained in:
chiguyong 2026-06-07 17:25:24 +08:00
parent 550d29a139
commit 9ec1740047
5 changed files with 1124 additions and 0 deletions

View File

@ -6,6 +6,9 @@ from agentkit.tools.agent_tool import AgentTool
from agentkit.tools.mcp_tool import MCPTool
from agentkit.tools.registry import ToolRegistry
from agentkit.tools.composition import SequentialChain, ParallelFanOut, DynamicSelector
from agentkit.tools.web_crawl import WebCrawlTool
from agentkit.tools.schema_tools import SchemaExtractTool, SchemaGenerateTool
from agentkit.tools.baidu_search import BaiduSearchTool
__all__ = [
"Tool",
@ -16,4 +19,8 @@ __all__ = [
"SequentialChain",
"ParallelFanOut",
"DynamicSelector",
"WebCrawlTool",
"SchemaExtractTool",
"SchemaGenerateTool",
"BaiduSearchTool",
]

View File

@ -0,0 +1,344 @@
"""Schema 工具集 - 结构化数据提取与生成
SchemaExtractTool: HTML 中提取 JSON-LD / Microdata / RDFa 等结构化数据
SchemaGenerateTool: 生成 Schema.org JSON-LD 标记
"""
import json
import logging
from typing import Any
from agentkit.tools.base import Tool
logger = logging.getLogger(__name__)

# Optional dependency probe: extruct.
# Both the module object and the availability flag are module globals that the
# tool classes read at call time.
extruct = None
try:
    import extruct
except ImportError:
    _EXTRUCT_AVAILABLE = False
else:
    _EXTRUCT_AVAILABLE = True

# Optional dependency probe: pydantic_schemaorg.
pydantic_schemaorg = None
try:
    import pydantic_schemaorg
except ImportError:
    _PYDANTIC_SCHEMAORG_AVAILABLE = False
else:
    _PYDANTIC_SCHEMAORG_AVAILABLE = True
class SchemaExtractTool(Tool):
    """Extract structured data (JSON-LD, Microdata, RDFa, Dublin Core) from HTML.

    Extraction is delegated to the optional ``extruct`` package. When it is not
    installed, :meth:`execute` returns an error payload with an install hint
    instead of raising.
    """

    SUPPORTED_FORMATS = {"json-ld", "microdata", "rdfa", "dublincore"}
    # Used when the caller does not pass ``formats``.
    DEFAULT_FORMATS = ("json-ld",)
    # Seconds to wait when ``url_or_html`` is a URL that has to be downloaded.
    FETCH_TIMEOUT = 30

    def __init__(
        self,
        name: str = "schema_extract",
        description: str = "从网页 HTML 中提取结构化数据(JSON-LD、Microdata、RDFa 等)",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Create the tool, falling back to the built-in schemas and tags."""
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["schema", "extraction"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON schema describing the accepted keyword arguments."""
        return {
            "type": "object",
            "properties": {
                "url_or_html": {
                    "type": "string",
                    "description": "要提取的 URL 或原始 HTML 字符串",
                },
                "formats": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "要提取的格式列表",
                    "default": ["json-ld"],
                },
            },
            "required": ["url_or_html"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON schema describing the result dictionary."""
        return {
            "type": "object",
            "properties": {
                "schemas": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "format": {"type": "string"},
                            "data": {"type": "object"},
                        },
                    },
                    "description": "提取到的结构化数据列表",
                },
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息(仅失败时)"},
            },
        }

    @staticmethod
    def _failure(message: str) -> dict[str, Any]:
        """Build the uniform error payload returned by :meth:`execute`."""
        return {"error": message, "schemas": [], "success": False}

    def _is_url(self, text: str) -> bool:
        """Return True when *text* starts with ``http://`` or ``https://``."""
        return text.strip().startswith(("http://", "https://"))

    def _normalize_formats(self, raw_formats: Any) -> tuple[list[Any], str | None]:
        """Turn the ``formats`` argument into a de-duplicated list.

        Returns ``(formats, None)`` on success or ``([], error_message)`` when
        the value is not a collection of supported format names.
        """
        if raw_formats is None:
            raw_formats = self.DEFAULT_FORMATS
        elif isinstance(raw_formats, str):
            # A single format name is accepted as shorthand for a one-item list.
            raw_formats = [raw_formats]
        try:
            # dict.fromkeys removes duplicates while keeping the caller's order.
            formats = list(dict.fromkeys(raw_formats))
        except TypeError:
            # Not iterable, or contains unhashable entries.
            return [], "formats 必须是字符串列表"
        invalid = [fmt for fmt in formats if fmt not in self.SUPPORTED_FORMATS]
        if invalid:
            shown_invalid = sorted(map(str, invalid))
            shown_supported = sorted(self.SUPPORTED_FORMATS)
            return [], f"不支持的格式: {shown_invalid},支持的格式: {shown_supported}"
        return formats, None

    @staticmethod
    def _fetch_html(url: str, timeout: float = 30) -> str:
        """Download *url* synchronously and return the body as text.

        The body is decoded with the charset declared in the response headers
        when there is one, otherwise as UTF-8; undecodable bytes are replaced.
        """
        import urllib.request

        req = urllib.request.Request(url, headers={"User-Agent": "AgentKit/1.0"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            body = resp.read()
            declared = resp.headers.get_content_charset()
        charset = declared if isinstance(declared, str) else "utf-8"
        try:
            return body.decode(charset, errors="replace")
        except LookupError:
            # The server announced a codec Python does not know.
            return body.decode("utf-8", errors="replace")

    async def execute(self, **kwargs) -> dict:
        """Extract structured data from a URL or an HTML string.

        Args:
            url_or_html: URL (``http://`` / ``https://``) or raw HTML. Required.
            formats: Formats to extract; defaults to ``["json-ld"]``. Allowed
                values are listed in ``SUPPORTED_FORMATS``.

        Returns:
            ``{"schemas": [...], "success": True}`` on success, where each entry
            is ``{"format": <name>, "data": <item>}``. On failure, a dict with
            ``error``, an empty ``schemas`` list and ``success`` False.
        """
        url_or_html = kwargs.get("url_or_html")
        if not url_or_html:
            return self._failure("url_or_html 参数是必需的")
        if not isinstance(url_or_html, str):
            return self._failure("url_or_html 必须是字符串")

        formats, problem = self._normalize_formats(kwargs.get("formats"))
        if problem is not None:
            return self._failure(problem)

        # Graceful degradation when extruct is not installed.
        if not _EXTRUCT_AVAILABLE:
            return self._failure("extruct not installed. Run: pip install extruct")

        html = url_or_html
        url = None
        if self._is_url(url_or_html):
            import asyncio

            # _is_url ignores surrounding whitespace, so the request must too.
            url = url_or_html.strip()
            try:
                # urlopen blocks; run it in a worker thread so the event loop
                # keeps serving other tasks during the download.
                html = await asyncio.to_thread(self._fetch_html, url, self.FETCH_TIMEOUT)
            except Exception as e:
                return self._failure(f"获取 URL 内容失败: {e}")

        try:
            # extruct selects formats through its ``syntaxes`` keyword.
            data = extruct.extract(html, base_url=url or "", syntaxes=formats)
            schemas: list[dict[str, Any]] = [
                {"format": fmt, "data": item}
                for fmt in formats
                for item in (data.get(fmt) or [])
            ]
            return {"schemas": schemas, "success": True}
        except Exception as e:
            logger.exception("SchemaExtractTool 提取失败: %s", e)
            return self._failure(str(e))
class SchemaGenerateTool(Tool):
    """Generate Schema.org JSON-LD markup for a fixed set of common types.

    When ``pydantic_schemaorg`` is installed the properties are first passed
    through the matching model; if that is unavailable or fails, the JSON-LD
    object is assembled by hand, which needs no external dependency.
    """

    SUPPORTED_TYPES = {
        "Organization",
        "WebPage",
        "Article",
        "Product",
        "FAQPage",
        "HowTo",
        "LocalBusiness",
        "Person",
        "BreadcrumbList",
        "SiteNavigationElement",
    }

    def __init__(
        self,
        name: str = "schema_generate",
        description: str = "生成 Schema.org JSON-LD 结构化数据标记",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Create the tool, falling back to the built-in schemas and tags."""
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["schema", "generation"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON schema describing the accepted keyword arguments."""
        return {
            "type": "object",
            "properties": {
                "schema_type": {
                    "type": "string",
                    "description": "Schema.org 类型名称,如 Organization、FAQPage 等",
                },
                "properties": {
                    "type": "object",
                    "description": "Schema 属性字典",
                },
            },
            "required": ["schema_type", "properties"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON schema describing the result dictionary."""
        return {
            "type": "object",
            "properties": {
                "jsonld": {"type": "string", "description": "生成的 JSON-LD 字符串"},
                "schema_type": {"type": "string", "description": "Schema 类型"},
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息(仅失败时)"},
            },
        }

    @staticmethod
    def _serialize(schema_type: str, body: dict[str, Any]) -> str:
        """Wrap *body* with ``@context`` / ``@type`` and dump it as JSON text.

        Keys in *body* are applied last, so a caller-supplied ``@context`` or
        ``@type`` replaces the default one.
        """
        jsonld_obj: dict[str, Any] = {
            "@context": "https://schema.org",
            "@type": schema_type,
        }
        jsonld_obj.update(body)
        return json.dumps(jsonld_obj, ensure_ascii=False, indent=2)

    @staticmethod
    def _resolve_schemaorg_class(schema_type: str) -> Any:
        """Look up the pydantic_schemaorg model for *schema_type*.

        The attribute on the package is tried first; if it is missing, the
        per-type submodule ``pydantic_schemaorg.<Type>`` is imported and the
        class of the same name is read from it. Returns None when nothing
        callable is found.
        """
        import importlib
        import types

        candidate = getattr(pydantic_schemaorg, schema_type, None)
        if candidate is None and schema_type.isidentifier():
            try:
                candidate = importlib.import_module(f"pydantic_schemaorg.{schema_type}")
            except ImportError:
                return None
        if isinstance(candidate, types.ModuleType):
            # The package attribute is the submodule; the model lives inside it.
            candidate = getattr(candidate, schema_type, None)
        return candidate if callable(candidate) else None

    def _generate_manual(self, schema_type: str, properties: dict[str, Any]) -> str:
        """Build the JSON-LD string by hand; needs no external dependency."""
        return self._serialize(schema_type, properties)

    def _generate_with_schemaorg(self, schema_type: str, properties: dict[str, Any]) -> str | None:
        """Build the JSON-LD string through pydantic_schemaorg.

        Returns None whenever this path cannot be used (package missing, type
        not provided, model rejected the properties, result not serializable)
        so the caller can fall back to :meth:`_generate_manual`.
        """
        if not _PYDANTIC_SCHEMAORG_AVAILABLE:
            return None
        try:
            schema_cls = self._resolve_schemaorg_class(schema_type)
            if schema_cls is None:
                return None
            instance = schema_cls(**properties)
            # by_alias keeps the JSON-LD key spellings (e.g. "@type") instead of
            # the Python field names.
            if hasattr(instance, "model_dump"):
                data = instance.model_dump(mode="json", by_alias=True, exclude_none=True)
            elif hasattr(instance, "dict"):
                data = instance.dict(by_alias=True, exclude_none=True)
            else:
                return None
            return self._serialize(schema_type, data)
        except Exception as exc:
            # Best effort by design: record why the validated path was skipped
            # and let the caller generate the markup manually.
            logger.warning(
                "pydantic-schemaorg 生成失败,降级为手动生成 (%s): %s", schema_type, exc
            )
            return None

    async def execute(self, **kwargs) -> dict:
        """Generate JSON-LD for one Schema.org type.

        Args:
            schema_type: Schema.org type name, one of ``SUPPORTED_TYPES``. Required.
            properties: Dict of Schema.org properties. Required.

        Returns:
            ``{"jsonld": <str>, "schema_type": <str>, "success": True}`` on
            success; otherwise a dict with ``error``, ``schema_type`` and
            ``success`` False.
        """
        schema_type = kwargs.get("schema_type")
        properties = kwargs.get("properties")

        def failure(message: str, reported_type: Any = schema_type) -> dict[str, Any]:
            return {"error": message, "schema_type": reported_type, "success": False}

        if not schema_type:
            return failure("schema_type 参数是必需的", "")
        if not isinstance(schema_type, str):
            # Guards the set membership test below against unhashable values.
            return failure("schema_type 必须是字符串")
        if properties is None:
            return failure("properties 参数是必需的")
        if not isinstance(properties, dict):
            return failure("properties 必须是字典类型")
        if schema_type not in self.SUPPORTED_TYPES:
            return failure(
                f"不支持的 schema_type: {schema_type},支持的类型: {sorted(self.SUPPORTED_TYPES)}"
            )

        try:
            # Prefer the validated path, then fall back to manual assembly.
            jsonld = self._generate_with_schemaorg(schema_type, properties)
            if jsonld is None:
                jsonld = self._generate_manual(schema_type, properties)
            return {
                "jsonld": jsonld,
                "schema_type": schema_type,
                "success": True,
            }
        except Exception as e:
            logger.exception("SchemaGenerateTool 生成失败: %s", e)
            return failure(str(e))

View File

@ -0,0 +1,159 @@
"""WebCrawlTool - 基于 Crawl4AI 的网页抓取工具,支持优雅降级"""
import logging
from typing import Any
from agentkit.tools.base import Tool
logger = logging.getLogger(__name__)

# Optional dependency probe: Crawl4AI.
# The names are pre-bound to None so they always exist as module globals; the
# flag records whether both imports succeeded.
AsyncWebCrawler = None
JsonCssExtractionStrategy = None
try:
    from crawl4ai import AsyncWebCrawler
    from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
except ImportError:
    _CRAWL4AI_AVAILABLE = False
else:
    _CRAWL4AI_AVAILABLE = True
class WebCrawlTool(Tool):
    """Fetch a web page through the optional Crawl4AI dependency.

    Supports Markdown or HTML output, an optional CSS selector for structured
    extraction and an optional wait for JavaScript rendering. When Crawl4AI is
    not installed, :meth:`execute` returns an error payload with an install
    hint instead of raising.
    """

    def __init__(
        self,
        name: str = "web_crawl",
        description: str = "抓取网页内容,支持 Markdown/HTML 输出和 CSS 选择器提取",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Create the tool, falling back to the built-in schemas and tags."""
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["web", "crawl"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON schema describing the accepted keyword arguments."""
        return {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "要抓取的 URL",
                },
                "format": {
                    "type": "string",
                    "description": "输出格式markdown 或 html",
                    "default": "markdown",
                    "enum": ["markdown", "html"],
                },
                "css_selector": {
                    "type": "string",
                    "description": "可选的 CSS 选择器,用于结构化提取",
                },
                "js_wait": {
                    "type": "number",
                    "description": "等待 JS 渲染的秒数",
                    "default": 0,
                },
            },
            "required": ["url"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON schema describing the result dictionary."""
        return {
            "type": "object",
            "properties": {
                "content": {"type": "string", "description": "抓取到的内容"},
                "status_code": {"type": "integer", "description": "HTTP 状态码"},
                "links": {"type": "array", "items": {"type": "string"}, "description": "页面中的链接"},
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息(仅失败时)"},
            },
        }

    @staticmethod
    def _collect_links(raw_links: Any) -> list[str]:
        """Flatten the crawler's link data into a list of URL strings.

        Accepts either a flat sequence of URLs or a mapping of group name to a
        sequence of entries, where an entry is a URL string or a dict carrying
        the URL under ``"href"``. Anything else yields an empty list.
        """
        if not raw_links:
            return []
        if isinstance(raw_links, dict):
            groups = list(raw_links.values())
        elif isinstance(raw_links, (list, tuple)):
            groups = [raw_links]
        else:
            return []

        hrefs: list[str] = []
        for group in groups:
            if not isinstance(group, (list, tuple)):
                continue
            for entry in group:
                href = entry.get("href") if isinstance(entry, dict) else entry
                if isinstance(href, str) and href:
                    hrefs.append(href)
        return hrefs

    async def execute(self, **kwargs) -> dict:
        """Crawl one URL and return its content.

        Args:
            url: URL to crawl. Required.
            format: ``"html"`` for raw HTML; any other value (default
                ``"markdown"``) returns Markdown.
            css_selector: Optional CSS selector used for structured extraction.
            js_wait: Seconds to wait for JavaScript rendering; default 0.

        Returns:
            On success a dict with ``content``, ``status_code``, ``links`` and
            ``success`` True, plus ``extracted`` when a selector produced
            output. On failure a dict with ``error`` and ``success`` False.
        """
        url = kwargs.get("url")
        if not url:
            return {"error": "url 参数是必需的", "success": False}

        output_format = kwargs.get("format", "markdown")
        css_selector = kwargs.get("css_selector")
        js_wait = kwargs.get("js_wait", 0)

        # Graceful degradation when Crawl4AI is not installed.
        if not _CRAWL4AI_AVAILABLE:
            return {
                "error": "Crawl4AI not installed. Run: pip install crawl4ai",
                "success": False,
            }

        try:
            extraction_strategy = None
            if css_selector:
                # NOTE(review): Crawl4AI documents JsonCssExtractionStrategy as
                # taking a schema dict rather than a bare selector string -
                # confirm this call against the supported Crawl4AI version.
                extraction_strategy = JsonCssExtractionStrategy(css_selector)

            async with AsyncWebCrawler() as crawler:
                # NOTE(review): confirm arun() accepts a ``js_wait`` keyword in
                # the supported Crawl4AI version.
                result = await crawler.arun(
                    url=url,
                    extraction_strategy=extraction_strategy,
                    js_wait=js_wait if js_wait else None,
                )

                # The crawler can report a failed crawl on the result object
                # instead of raising; surface that rather than claiming success.
                if getattr(result, "success", True) is False:
                    reason = getattr(result, "error_message", None) or "crawl failed"
                    logger.error("WebCrawlTool 抓取失败: %s - %s", url, reason)
                    return {"error": str(reason), "success": False}

                raw_content = (result.html if output_format == "html" else result.markdown) or ""
                # The output schema promises a string.
                content = raw_content if isinstance(raw_content, str) else str(raw_content)

                response: dict[str, Any] = {
                    "content": content,
                    "status_code": getattr(result, "status_code", 200),
                    "links": self._collect_links(getattr(result, "links", None)),
                    "success": True,
                }

                # Attach the structured extraction output when a selector was used.
                extracted = getattr(result, "extracted_content", None)
                if extraction_strategy and extracted:
                    response["extracted"] = extracted
                return response
        except Exception as e:
            logger.exception("WebCrawlTool 抓取失败: %s - %s", url, e)
            return {
                "error": str(e),
                "success": False,
            }

View File

@ -0,0 +1,413 @@
"""Schema 工具集单元测试 - SchemaExtractTool + SchemaGenerateTool"""
import json
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from agentkit.tools.schema_tools import SchemaExtractTool, SchemaGenerateTool
# ========== SchemaExtractTool 测试 ==========
class TestSchemaExtractToolConstruction:
    """Construction and metadata of SchemaExtractTool."""

    def test_default_construction(self):
        """Defaults expose the expected name and require ``url_or_html``."""
        extractor = SchemaExtractTool()
        assert extractor.name == "schema_extract"
        assert extractor.input_schema is not None
        assert extractor.output_schema is not None
        in_schema = extractor.input_schema
        assert "url_or_html" in in_schema["properties"]
        assert in_schema["required"] == ["url_or_html"]

    def test_custom_construction(self):
        """A custom name passed to the constructor is kept."""
        overrides = {
            "name": "my_extractor",
            "description": "自定义提取器",
            "version": "2.0.0",
        }
        extractor = SchemaExtractTool(**overrides)
        assert extractor.name == "my_extractor"

    def test_supported_formats(self):
        """All four documented formats are advertised."""
        extractor = SchemaExtractTool()
        for fmt in ("json-ld", "microdata", "rdfa", "dublincore"):
            assert fmt in extractor.SUPPORTED_FORMATS

    def test_to_dict(self):
        """The serialized form carries the tool name."""
        serialized = SchemaExtractTool().to_dict()
        assert serialized["name"] == "schema_extract"
class TestSchemaExtractToolGracefulDegradation:
    """Behaviour when extruct is not available."""

    @pytest.mark.asyncio
    async def test_execute_without_extruct(self):
        """Without extruct the tool reports failure with an install hint."""
        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", False):
            outcome = await SchemaExtractTool().execute(url_or_html="<html></html>")
        error_text = outcome["error"]
        assert outcome["success"] is False
        assert "extruct not installed" in error_text
        assert "pip install extruct" in error_text
        assert outcome["schemas"] == []
class TestSchemaExtractToolValidation:
    """Input validation of SchemaExtractTool.execute."""

    @pytest.mark.asyncio
    async def test_execute_missing_url_or_html(self):
        """Omitting ``url_or_html`` fails and names the parameter."""
        outcome = await SchemaExtractTool().execute()
        assert outcome["success"] is False
        assert "url_or_html" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_empty_url_or_html(self):
        """An empty ``url_or_html`` fails."""
        outcome = await SchemaExtractTool().execute(url_or_html="")
        assert outcome["success"] is False

    @pytest.mark.asyncio
    async def test_execute_invalid_format(self):
        """An unknown format name is rejected."""
        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True):
            outcome = await SchemaExtractTool().execute(
                url_or_html="<html></html>",
                formats=["invalid-format"],
            )
        message = outcome["error"]
        assert outcome["success"] is False
        assert "不支持" in message or "invalid" in message.lower()
class TestSchemaExtractToolWithMockedExtruct:
    """Extraction logic exercised against a mocked extruct module."""

    SAMPLE_HTML_WITH_JSONLD = "\n".join(
        [
            "",
            "<html>",
            "<head>",
            '<script type="application/ld+json">',
            "{",
            '"@context": "https://schema.org",',
            '"@type": "Organization",',
            '"name": "Test Corp"',
            "}",
            "</script>",
            "</head>",
            "<body></body>",
            "</html>",
            "",
        ]
    )

    @staticmethod
    def _fake_extruct(payload=None, error=None):
        """Return a stand-in extruct module whose extract() yields *payload* or raises *error*."""
        fake = MagicMock()
        if error is not None:
            fake.extract.side_effect = error
        else:
            fake.extract.return_value = payload
        return fake

    @pytest.mark.asyncio
    async def test_extract_jsonld_from_html(self):
        """A JSON-LD item found in the HTML is returned under its format."""
        fake = self._fake_extruct(
            {
                "json-ld": [
                    {"@context": "https://schema.org", "@type": "Organization", "name": "Test Corp"}
                ]
            }
        )
        with (
            patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True),
            patch("agentkit.tools.schema_tools.extruct", fake),
        ):
            outcome = await SchemaExtractTool().execute(url_or_html=self.SAMPLE_HTML_WITH_JSONLD)
        assert outcome["success"] is True
        assert len(outcome["schemas"]) == 1
        first = outcome["schemas"][0]
        assert first["format"] == "json-ld"
        assert first["data"]["@type"] == "Organization"
        assert first["data"]["name"] == "Test Corp"

    @pytest.mark.asyncio
    async def test_extract_no_schema_data(self):
        """HTML without structured data yields success and an empty list."""
        fake = self._fake_extruct({"json-ld": []})
        with (
            patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True),
            patch("agentkit.tools.schema_tools.extruct", fake),
        ):
            outcome = await SchemaExtractTool().execute(
                url_or_html="<html><body>No schema</body></html>"
            )
        assert outcome["success"] is True
        assert outcome["schemas"] == []

    @pytest.mark.asyncio
    async def test_extract_multiple_formats(self):
        """Items from several requested formats are all returned."""
        fake = self._fake_extruct(
            {
                "json-ld": [{"@type": "Organization", "name": "Corp"}],
                "microdata": [{"type": "Product", "name": "Item"}],
            }
        )
        with (
            patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True),
            patch("agentkit.tools.schema_tools.extruct", fake),
        ):
            outcome = await SchemaExtractTool().execute(
                url_or_html="<html></html>",
                formats=["json-ld", "microdata"],
            )
        assert outcome["success"] is True
        assert len(outcome["schemas"]) == 2
        seen_formats = {entry["format"] for entry in outcome["schemas"]}
        assert "json-ld" in seen_formats
        assert "microdata" in seen_formats

    @pytest.mark.asyncio
    async def test_extract_error_handling(self):
        """An exception raised by extruct is reported in the error payload."""
        fake = self._fake_extruct(error=Exception("Parse error"))
        with (
            patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True),
            patch("agentkit.tools.schema_tools.extruct", fake),
        ):
            outcome = await SchemaExtractTool().execute(url_or_html="<html></html>")
        assert outcome["success"] is False
        assert "Parse error" in outcome["error"]

    @pytest.mark.asyncio
    async def test_extract_with_url(self):
        """A URL input is downloaded first, then extracted."""
        fake = self._fake_extruct({"json-ld": [{"@type": "WebPage"}]})
        with (
            patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True),
            patch("agentkit.tools.schema_tools.extruct", fake),
            patch("urllib.request.urlopen") as fake_urlopen,
        ):
            # The response object doubles as its own context manager.
            response = MagicMock()
            response.read.return_value = b"<html><body>Test</body></html>"
            response.__enter__.return_value = response
            response.__exit__.return_value = None
            fake_urlopen.return_value = response
            outcome = await SchemaExtractTool().execute(url_or_html="https://example.com")
        assert outcome["success"] is True
# ========== SchemaGenerateTool 测试 ==========
class TestSchemaGenerateToolConstruction:
    """Construction and metadata of SchemaGenerateTool."""

    def test_default_construction(self):
        """Defaults expose the expected name and both input properties."""
        generator = SchemaGenerateTool()
        assert generator.name == "schema_generate"
        assert generator.input_schema is not None
        assert generator.output_schema is not None
        declared = generator.input_schema["properties"]
        assert "schema_type" in declared
        assert "properties" in declared

    def test_supported_types(self):
        """Every documented Schema.org type is advertised."""
        generator = SchemaGenerateTool()
        expected_types = (
            "Organization",
            "FAQPage",
            "Article",
            "Product",
            "HowTo",
            "LocalBusiness",
            "Person",
            "BreadcrumbList",
            "SiteNavigationElement",
            "WebPage",
        )
        for type_name in expected_types:
            assert type_name in generator.SUPPORTED_TYPES
class TestSchemaGenerateToolValidation:
    """Input validation of SchemaGenerateTool.execute."""

    @pytest.mark.asyncio
    async def test_execute_missing_schema_type(self):
        """Omitting ``schema_type`` fails and names the parameter."""
        outcome = await SchemaGenerateTool().execute(properties={"name": "Test"})
        assert outcome["success"] is False
        assert "schema_type" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_missing_properties(self):
        """Omitting ``properties`` fails and names the parameter."""
        outcome = await SchemaGenerateTool().execute(schema_type="Organization")
        assert outcome["success"] is False
        assert "properties" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_invalid_schema_type(self):
        """A type outside the supported set is rejected."""
        outcome = await SchemaGenerateTool().execute(
            schema_type="InvalidType",
            properties={"name": "Test"},
        )
        message = outcome["error"]
        assert outcome["success"] is False
        assert "不支持" in message or "InvalidType" in message

    @pytest.mark.asyncio
    async def test_execute_properties_not_dict(self):
        """A non-dict ``properties`` value is rejected."""
        outcome = await SchemaGenerateTool().execute(
            schema_type="Organization",
            properties="not a dict",
        )
        message = outcome["error"]
        assert outcome["success"] is False
        assert "字典" in message or "dict" in message.lower()
class TestSchemaGenerateToolManualGeneration:
    """Manual JSON-LD generation, which needs no external dependency."""

    @staticmethod
    async def _generate(schema_type, properties):
        """Run the tool with pydantic-schemaorg forced off and return its result."""
        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
            return await SchemaGenerateTool().execute(
                schema_type=schema_type,
                properties=properties,
            )

    @pytest.mark.asyncio
    async def test_generate_organization(self):
        """Organization markup carries context, type and the given properties."""
        outcome = await self._generate(
            "Organization",
            {"name": "Fischer AI", "url": "https://fischer.ai"},
        )
        assert outcome["success"] is True
        assert outcome["schema_type"] == "Organization"
        document = json.loads(outcome["jsonld"])
        assert document["@context"] == "https://schema.org"
        assert document["@type"] == "Organization"
        assert document["name"] == "Fischer AI"
        assert document["url"] == "https://fischer.ai"

    @pytest.mark.asyncio
    async def test_generate_faq_page(self):
        """FAQPage markup keeps the nested question list."""
        question = {
            "@type": "Question",
            "name": "What is GEO?",
            "acceptedAnswer": {
                "@type": "Answer",
                "text": "Generative Engine Optimization",
            },
        }
        outcome = await self._generate("FAQPage", {"mainEntity": [question]})
        assert outcome["success"] is True
        document = json.loads(outcome["jsonld"])
        assert document["@type"] == "FAQPage"
        assert len(document["mainEntity"]) == 1

    @pytest.mark.asyncio
    async def test_generate_article(self):
        """Article markup keeps the headline."""
        outcome = await self._generate(
            "Article",
            {
                "headline": "Test Article",
                "author": {"@type": "Person", "name": "John"},
            },
        )
        assert outcome["success"] is True
        document = json.loads(outcome["jsonld"])
        assert document["@type"] == "Article"
        assert document["headline"] == "Test Article"

    @pytest.mark.asyncio
    async def test_generate_breadcrumb_list(self):
        """BreadcrumbList markup is generated with the right type."""
        crumbs = [{"@type": "ListItem", "position": 1, "name": "Home"}]
        outcome = await self._generate("BreadcrumbList", {"itemListElement": crumbs})
        assert outcome["success"] is True
        document = json.loads(outcome["jsonld"])
        assert document["@type"] == "BreadcrumbList"

    @pytest.mark.asyncio
    async def test_output_is_valid_jsonld(self):
        """Output always parses as JSON and contains ``@context`` and ``@type``."""
        for schema_type in ["Organization", "WebPage", "Product", "Person"]:
            outcome = await self._generate(schema_type, {"name": f"Test {schema_type}"})
            assert outcome["success"] is True
            document = json.loads(outcome["jsonld"])
            assert "@context" in document
            assert document["@context"] == "https://schema.org"
            assert "@type" in document
            assert document["@type"] == schema_type

    @pytest.mark.asyncio
    async def test_manual_generation_preserves_chinese(self):
        """Chinese text survives the round trip unchanged."""
        outcome = await self._generate(
            "Organization",
            {"name": "费舍尔科技", "description": "AI 驱动的企业平台"},
        )
        assert outcome["success"] is True
        document = json.loads(outcome["jsonld"])
        assert document["name"] == "费舍尔科技"
        assert document["description"] == "AI 驱动的企业平台"
class TestSchemaGenerateToolWithPydanticSchemaorg:
    """Behaviour around the optional pydantic-schemaorg path."""

    @pytest.mark.asyncio
    async def test_fallback_to_manual_when_schemaorg_fails(self):
        """When the pydantic-schemaorg path cannot build the model, manual generation is used."""
        fake_package = MagicMock()
        # Setting the attribute to None simulates a type the package does not provide.
        fake_package.Organization = None
        with (
            patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", True),
            patch("agentkit.tools.schema_tools.pydantic_schemaorg", fake_package),
        ):
            outcome = await SchemaGenerateTool().execute(
                schema_type="Organization",
                properties={"name": "Test"},
            )
        # The manual fallback must still produce valid output.
        assert outcome["success"] is True
        document = json.loads(outcome["jsonld"])
        assert document["@type"] == "Organization"
        assert document["name"] == "Test"

    @pytest.mark.asyncio
    async def test_schemaorg_not_available_uses_manual(self):
        """With pydantic-schemaorg flagged unavailable, manual generation is used."""
        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
            outcome = await SchemaGenerateTool().execute(
                schema_type="Organization",
                properties={"name": "Manual Corp"},
            )
        assert outcome["success"] is True
        document = json.loads(outcome["jsonld"])
        assert document["name"] == "Manual Corp"
class TestSchemaGenerateToolSafeExecute:
    """The ``safe_execute`` entry point of SchemaGenerateTool."""

    @pytest.mark.asyncio
    async def test_safe_execute_success(self):
        """``safe_execute`` returns a successful result for valid input."""
        call_args = {
            "schema_type": "Organization",
            "properties": {"name": "Test"},
        }
        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
            outcome = await SchemaGenerateTool().safe_execute(**call_args)
        assert outcome["success"] is True

View File

@ -0,0 +1,201 @@
"""WebCrawlTool 单元测试"""
import sys
import types
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from agentkit.tools.web_crawl import WebCrawlTool
class TestWebCrawlToolConstruction:
    """Construction and metadata of WebCrawlTool."""

    def test_default_construction(self):
        """Defaults expose the expected name, description and required ``url``."""
        crawler_tool = WebCrawlTool()
        assert crawler_tool.name == "web_crawl"
        blurb = crawler_tool.description
        assert "抓取" in blurb or "crawl" in blurb.lower()
        assert crawler_tool.input_schema is not None
        assert crawler_tool.output_schema is not None
        assert "url" in crawler_tool.input_schema["properties"]
        assert crawler_tool.input_schema["required"] == ["url"]

    def test_custom_construction(self):
        """Every constructor override is stored on the instance."""
        overrides = {
            "name": "my_crawler",
            "description": "自定义爬虫",
            "version": "2.0.0",
            "tags": ["custom"],
        }
        crawler_tool = WebCrawlTool(**overrides)
        assert crawler_tool.name == "my_crawler"
        assert crawler_tool.description == "自定义爬虫"
        assert crawler_tool.version == "2.0.0"
        assert crawler_tool.tags == ["custom"]

    def test_to_dict(self):
        """The serialized form carries the name and both schemas."""
        serialized = WebCrawlTool().to_dict()
        assert serialized["name"] == "web_crawl"
        assert "input_schema" in serialized
        assert "output_schema" in serialized

    def test_repr(self):
        """The repr mentions the class and the tool name."""
        text = repr(WebCrawlTool())
        assert "WebCrawlTool" in text
        assert "web_crawl" in text
class TestWebCrawlToolGracefulDegradation:
    """Behaviour when Crawl4AI is not available."""

    @pytest.mark.asyncio
    async def test_execute_without_crawl4ai(self):
        """Without Crawl4AI the tool reports failure with an install hint."""
        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", False):
            outcome = await WebCrawlTool().execute(url="https://example.com")
        error_text = outcome["error"]
        assert outcome["success"] is False
        assert "Crawl4AI not installed" in error_text
        assert "pip install crawl4ai" in error_text

    @pytest.mark.asyncio
    async def test_safe_execute_without_crawl4ai(self):
        """``safe_execute`` also returns a failure result when Crawl4AI is missing."""
        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", False):
            outcome = await WebCrawlTool().safe_execute(url="https://example.com")
        assert outcome["success"] is False
class TestWebCrawlToolValidation:
    """Input validation of WebCrawlTool.execute."""

    @pytest.mark.asyncio
    async def test_execute_missing_url(self):
        """Omitting ``url`` fails and names the parameter."""
        outcome = await WebCrawlTool().execute()
        assert outcome["success"] is False
        assert "url" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_empty_url(self):
        """An empty ``url`` fails."""
        outcome = await WebCrawlTool().execute(url="")
        assert outcome["success"] is False
class TestWebCrawlToolWithMockedCrawl4AI:
    """Normal crawl logic exercised against a mocked Crawl4AI crawler."""

    def _make_mock_crawler(self, markdown="# Hello", html="<h1>Hello</h1>", links=None, status_code=200):
        """Build a mock AsyncWebCrawler and the result object its ``arun`` returns.

        Returns:
            ``(mock_crawler, mock_result)``; the crawler works as an async
            context manager that yields itself.
        """
        mock_result = MagicMock()
        mock_result.markdown = markdown
        mock_result.html = html
        mock_result.links = links or ["https://example.com/page1"]
        mock_result.status_code = status_code
        mock_result.extracted_content = None

        mock_crawler = AsyncMock()
        mock_crawler.arun = AsyncMock(return_value=mock_result)
        mock_crawler.__aenter__ = AsyncMock(return_value=mock_crawler)
        mock_crawler.__aexit__ = AsyncMock(return_value=None)
        return mock_crawler, mock_result

    @pytest.mark.asyncio
    async def test_execute_markdown_format(self):
        """``format="markdown"`` returns the result's Markdown."""
        mock_crawler, _ = self._make_mock_crawler(markdown="# Test Page")
        with (
            patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True),
            patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler),
        ):
            result = await WebCrawlTool().execute(url="https://example.com", format="markdown")
        assert result["success"] is True
        assert result["content"] == "# Test Page"
        assert result["status_code"] == 200

    @pytest.mark.asyncio
    async def test_execute_html_format(self):
        """``format="html"`` returns the result's HTML."""
        mock_crawler, _ = self._make_mock_crawler(html="<h1>Test</h1>")
        with (
            patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True),
            patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler),
        ):
            result = await WebCrawlTool().execute(url="https://example.com", format="html")
        assert result["success"] is True
        assert result["content"] == "<h1>Test</h1>"

    @pytest.mark.asyncio
    async def test_execute_with_links(self):
        """Links reported by the crawler are passed through."""
        mock_crawler, _ = self._make_mock_crawler(
            links=["https://example.com/a", "https://example.com/b"]
        )
        with (
            patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True),
            patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler),
        ):
            result = await WebCrawlTool().execute(url="https://example.com")
        assert result["success"] is True
        assert len(result["links"]) == 2

    @pytest.mark.asyncio
    async def test_execute_with_css_selector(self):
        """A CSS selector builds an extraction strategy and exposes its output."""
        mock_crawler, mock_result = self._make_mock_crawler()
        mock_result.extracted_content = '{"title": "Test"}'
        mock_strategy_cls = MagicMock(return_value=MagicMock())
        with (
            patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True),
            patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler),
            patch("agentkit.tools.web_crawl.JsonCssExtractionStrategy", mock_strategy_cls),
        ):
            result = await WebCrawlTool().execute(url="https://example.com", css_selector="h1")
        assert result["success"] is True
        assert "extracted" in result
        mock_strategy_cls.assert_called_once_with("h1")

    @pytest.mark.asyncio
    async def test_execute_with_js_wait(self):
        """The ``js_wait`` value is forwarded unchanged to ``arun``."""
        mock_crawler, _ = self._make_mock_crawler()
        with (
            patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True),
            patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler),
        ):
            result = await WebCrawlTool().execute(url="https://example.com", js_wait=2)
        assert result["success"] is True
        # Exact equality: an "or ... is not None" escape hatch would accept any
        # forwarded value and hide a wrong one.
        forwarded = mock_crawler.arun.call_args.kwargs
        assert forwarded.get("js_wait") == 2

    @pytest.mark.asyncio
    async def test_execute_crawl_error(self):
        """An exception raised while crawling is reported in the error payload."""
        mock_crawler = AsyncMock()
        mock_crawler.arun = AsyncMock(side_effect=Exception("Connection timeout"))
        mock_crawler.__aenter__ = AsyncMock(return_value=mock_crawler)
        mock_crawler.__aexit__ = AsyncMock(return_value=None)
        with (
            patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True),
            patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler),
        ):
            result = await WebCrawlTool().execute(url="https://example.com")
        assert result["success"] is False
        assert "Connection timeout" in result["error"]

    @pytest.mark.asyncio
    async def test_execute_default_format_is_markdown(self):
        """Without ``format`` the Markdown content is returned."""
        mock_crawler, _ = self._make_mock_crawler(markdown="MD content", html="HTML content")
        with (
            patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True),
            patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler),
        ):
            result = await WebCrawlTool().execute(url="https://example.com")
        assert result["success"] is True
        assert result["content"] == "MD content"