feat(tools): U3 built-in Python tools - WebCrawl, SchemaExtract, SchemaGenerate

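Add WebCrawlTool (Crawl4AI wrapper with graceful degradation), SchemaExtractTool (extruct-based Schema.org extraction), and SchemaGenerateTool (JSON-LD generation with optional pydantic-schemaorg validation). All tools can be imported and constructed without the optional dependencies: WebCrawlTool and SchemaExtractTool return an error with an install hint when Crawl4AI or extruct is missing, while SchemaGenerateTool falls back to building the JSON-LD manually.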
2026-06-07 17:25:24 +08:00 · 2026-06-07 17:25:24 +08:00 · 9ec1740047
parent 550d29a139
commit 9ec1740047
5 changed files with 1124 additions and 0 deletions
--- a/src/agentkit/tools/init.py
+++ b/src/agentkit/tools/init.py
@ -6,6 +6,9 @@ from agentkit.tools.agent_tool import AgentTool
 from agentkit.tools.mcp_tool import MCPTool
 from agentkit.tools.registry import ToolRegistry
 from agentkit.tools.composition import SequentialChain, ParallelFanOut, DynamicSelector
+from agentkit.tools.web_crawl import WebCrawlTool
+from agentkit.tools.schema_tools import SchemaExtractTool, SchemaGenerateTool
+from agentkit.tools.baidu_search import BaiduSearchTool

 __all__ = [
    "Tool",
@ -16,4 +19,8 @@ __all__ = [
    "SequentialChain",
    "ParallelFanOut",
    "DynamicSelector",
+    "WebCrawlTool",
+    "SchemaExtractTool",
+    "SchemaGenerateTool",
+    "BaiduSearchTool",
 ]
--- a/src/agentkit/tools/schema_tools.py
+++ b/src/agentkit/tools/schema_tools.py
@ -0,0 +1,344 @@
+"""Schema 工具集 - 结构化数据提取与生成
+
+SchemaExtractTool: 从 HTML 中提取 JSON-LD / Microdata / RDFa 等结构化数据
+SchemaGenerateTool: 生成 Schema.org JSON-LD 标记
+"""
+
+import json
+import logging
+from typing import Any
+
+from agentkit.tools.base import Tool
+
+logger = logging.getLogger(__name__)
+
+# 检测 extruct 是否可用
+_EXTRUCT_AVAILABLE = False
+extruct = None
+try:
+    import extruct
+
+    _EXTRUCT_AVAILABLE = True
+except ImportError:
+    pass
+
+# 检测 pydantic_schemaorg 是否可用
+_PYDANTIC_SCHEMAORG_AVAILABLE = False
+pydantic_schemaorg = None
+try:
+    import pydantic_schemaorg
+
+    _PYDANTIC_SCHEMAORG_AVAILABLE = True
+except ImportError:
+    pass
+
+
+class SchemaExtractTool(Tool):
+    """结构化数据提取工具 - 从 HTML 中提取 JSON-LD、Microdata、RDFa 等
+
+    使用 extruct 库进行提取，当 extruct 未安装时优雅降级。
+    """
+
+    SUPPORTED_FORMATS = {"json-ld", "microdata", "rdfa", "dublincore"}
+
+    def __init__(
+        self,
+        name: str = "schema_extract",
+        description: str = "从网页 HTML 中提取结构化数据（JSON-LD、Microdata、RDFa 等）",
+        input_schema: dict[str, Any] | None = None,
+        output_schema: dict[str, Any] | None = None,
+        version: str = "1.0.0",
+        tags: list[str] | None = None,
+    ):
+        super().__init__(
+            name=name,
+            description=description,
+            input_schema=input_schema or self._default_input_schema(),
+            output_schema=output_schema or self._default_output_schema(),
+            version=version,
+            tags=tags or ["schema", "extraction"],
+        )
+
+    @staticmethod
+    def _default_input_schema() -> dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "url_or_html": {
+                    "type": "string",
+                    "description": "要提取的 URL 或原始 HTML 字符串",
+                },
+                "formats": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "要提取的格式列表",
+                    "default": ["json-ld"],
+                },
+            },
+            "required": ["url_or_html"],
+        }
+
+    @staticmethod
+    def _default_output_schema() -> dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "schemas": {
+                    "type": "array",
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "format": {"type": "string"},
+                            "data": {"type": "object"},
+                        },
+                    },
+                    "description": "提取到的结构化数据列表",
+                },
+                "success": {"type": "boolean", "description": "是否成功"},
+                "error": {"type": "string", "description": "错误信息（仅失败时）"},
+            },
+        }
+
+    def _is_url(self, text: str) -> bool:
+        """判断输入是 URL 还是 HTML"""
+        return text.strip().startswith(("http://", "https://"))
+
+    async def execute(self, **kwargs) -> dict:
+        """执行结构化数据提取
+
+        Args:
+            url_or_html: URL 或原始 HTML 字符串（必需）
+            formats: 要提取的格式列表（默认 ["json-ld"]）
+                     可选: "json-ld", "microdata", "rdfa", "dublincore"
+
+        Returns:
+            包含 schemas 列表和 success 布尔值的字典
+        """
+        url_or_html = kwargs.get("url_or_html")
+        if not url_or_html:
+            return {"error": "url_or_html 参数是必需的", "schemas": [], "success": False}
+
+        formats = kwargs.get("formats", ["json-ld"])
+        # 验证格式
+        invalid_formats = set(formats) - self.SUPPORTED_FORMATS
+        if invalid_formats:
+            return {
+                "error": f"不支持的格式: {invalid_formats}，支持的格式: {self.SUPPORTED_FORMATS}",
+                "schemas": [],
+                "success": False,
+            }
+
+        # 优雅降级：extruct 未安装
+        if not _EXTRUCT_AVAILABLE:
+            return {
+                "error": "extruct not installed. Run: pip install extruct",
+                "schemas": [],
+                "success": False,
+            }
+
+        try:
+            html = url_or_html
+            url = None
+
+            # 如果输入是 URL，先获取 HTML
+            if self._is_url(url_or_html):
+                url = url_or_html
+                try:
+                    import urllib.request
+
+                    req = urllib.request.Request(url, headers={"User-Agent": "AgentKit/1.0"})
+                    with urllib.request.urlopen(req, timeout=30) as resp:
+                        html = resp.read().decode("utf-8", errors="replace")
+                except Exception as e:
+                    return {
+                        "error": f"获取 URL 内容失败: {e}",
+                        "schemas": [],
+                        "success": False,
+                    }
+
+            # 使用 extruct 提取
+            data = extruct.extract(
+                html,
+                base_url=url or "",
+                formats=formats,
+            )
+
+            # 整理结果
+            schemas: list[dict[str, Any]] = []
+            for fmt in formats:
+                items = data.get(fmt, [])
+                if items:
+                    for item in items:
+                        schemas.append({"format": fmt, "data": item})
+
+            return {"schemas": schemas, "success": True}
+
+        except Exception as e:
+            logger.error(f"SchemaExtractTool 提取失败: {e}")
+            return {
+                "error": str(e),
+                "schemas": [],
+                "success": False,
+            }
+
+
+class SchemaGenerateTool(Tool):
+    """JSON-LD 结构化数据生成工具 - 为常见 Schema.org 类型生成标记
+
+    当 pydantic-schemaorg 可用时提供验证，否则手动构建 JSON-LD。
+    手动生成始终可用，无需外部依赖。
+    """
+
+    SUPPORTED_TYPES = {
+        "Organization",
+        "WebPage",
+        "Article",
+        "Product",
+        "FAQPage",
+        "HowTo",
+        "LocalBusiness",
+        "Person",
+        "BreadcrumbList",
+        "SiteNavigationElement",
+    }
+
+    def __init__(
+        self,
+        name: str = "schema_generate",
+        description: str = "生成 Schema.org JSON-LD 结构化数据标记",
+        input_schema: dict[str, Any] | None = None,
+        output_schema: dict[str, Any] | None = None,
+        version: str = "1.0.0",
+        tags: list[str] | None = None,
+    ):
+        super().__init__(
+            name=name,
+            description=description,
+            input_schema=input_schema or self._default_input_schema(),
+            output_schema=output_schema or self._default_output_schema(),
+            version=version,
+            tags=tags or ["schema", "generation"],
+        )
+
+    @staticmethod
+    def _default_input_schema() -> dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "schema_type": {
+                    "type": "string",
+                    "description": "Schema.org 类型名称，如 Organization、FAQPage 等",
+                },
+                "properties": {
+                    "type": "object",
+                    "description": "Schema 属性字典",
+                },
+            },
+            "required": ["schema_type", "properties"],
+        }
+
+    @staticmethod
+    def _default_output_schema() -> dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "jsonld": {"type": "string", "description": "生成的 JSON-LD 字符串"},
+                "schema_type": {"type": "string", "description": "Schema 类型"},
+                "success": {"type": "boolean", "description": "是否成功"},
+                "error": {"type": "string", "description": "错误信息（仅失败时）"},
+            },
+        }
+
+    def _generate_manual(self, schema_type: str, properties: dict[str, Any]) -> str:
+        """手动构建 JSON-LD（无需外部依赖）"""
+        jsonld_obj: dict[str, Any] = {
+            "@context": "https://schema.org",
+            "@type": schema_type,
+        }
+        jsonld_obj.update(properties)
+        return json.dumps(jsonld_obj, ensure_ascii=False, indent=2)
+
+    def _generate_with_schemaorg(self, schema_type: str, properties: dict[str, Any]) -> str | None:
+        """使用 pydantic-schemaorg 生成 JSON-LD（带验证）"""
+        if not _PYDANTIC_SCHEMAORG_AVAILABLE:
+            return None
+
+        try:
+            # 尝试获取对应的 pydantic_schemaorg 类
+            schema_cls = getattr(pydantic_schemaorg, schema_type, None)
+            if schema_cls is None:
+                return None
+
+            instance = schema_cls(**properties)
+            # pydantic_schemaorg 对象转 dict
+            if hasattr(instance, "model_dump"):
+                data = instance.model_dump(exclude_none=True)
+            elif hasattr(instance, "dict"):
+                data = instance.dict(exclude_none=True)
+            else:
+                return None
+
+            jsonld_obj: dict[str, Any] = {
+                "@context": "https://schema.org",
+                "@type": schema_type,
+            }
+            jsonld_obj.update(data)
+            return json.dumps(jsonld_obj, ensure_ascii=False, indent=2)
+        except Exception:
+            return None
+
+    async def execute(self, **kwargs) -> dict:
+        """执行 JSON-LD 生成
+
+        Args:
+            schema_type: Schema.org 类型名称（必需，如 "Organization"）
+            properties: Schema 属性字典（必需）
+
+        Returns:
+            包含 jsonld 字符串、schema_type 和 success 布尔值的字典
+        """
+        schema_type = kwargs.get("schema_type")
+        properties = kwargs.get("properties")
+
+        if not schema_type:
+            return {"error": "schema_type 参数是必需的", "schema_type": "", "success": False}
+
+        if properties is None:
+            return {"error": "properties 参数是必需的", "schema_type": schema_type, "success": False}
+
+        if not isinstance(properties, dict):
+            return {
+                "error": "properties 必须是字典类型",
+                "schema_type": schema_type,
+                "success": False,
+            }
+
+        # 验证 schema_type
+        if schema_type not in self.SUPPORTED_TYPES:
+            return {
+                "error": f"不支持的 schema_type: {schema_type}，支持的类型: {sorted(self.SUPPORTED_TYPES)}",
+                "schema_type": schema_type,
+                "success": False,
+            }
+
+        try:
+            # 优先尝试使用 pydantic-schemaorg（带验证）
+            jsonld = self._generate_with_schemaorg(schema_type, properties)
+
+            # 降级到手动生成
+            if jsonld is None:
+                jsonld = self._generate_manual(schema_type, properties)
+
+            return {
+                "jsonld": jsonld,
+                "schema_type": schema_type,
+                "success": True,
+            }
+
+        except Exception as e:
+            logger.error(f"SchemaGenerateTool 生成失败: {e}")
+            return {
+                "error": str(e),
+                "schema_type": schema_type,
+                "success": False,
+            }
--- a/src/agentkit/tools/web_crawl.py
+++ b/src/agentkit/tools/web_crawl.py
@ -0,0 +1,159 @@
+"""WebCrawlTool - 基于 Crawl4AI 的网页抓取工具，支持优雅降级"""
+
+import logging
+from typing import Any
+
+from agentkit.tools.base import Tool
+
+logger = logging.getLogger(__name__)
+
+# 检测 Crawl4AI 是否可用
+_CRAWL4AI_AVAILABLE = False
+AsyncWebCrawler = None
+JsonCssExtractionStrategy = None
+try:
+    from crawl4ai import AsyncWebCrawler
+    from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+    _CRAWL4AI_AVAILABLE = True
+except ImportError:
+    pass
+
+
+class WebCrawlTool(Tool):
+    """网页抓取工具 - 使用 Crawl4AI，可选依赖未安装时优雅降级
+
+    支持 Markdown/HTML 输出、CSS 选择器提取、JS 渲染等待。
+    当 Crawl4AI 未安装时，返回包含安装提示的错误信息。
+    """
+
+    def __init__(
+        self,
+        name: str = "web_crawl",
+        description: str = "抓取网页内容，支持 Markdown/HTML 输出和 CSS 选择器提取",
+        input_schema: dict[str, Any] | None = None,
+        output_schema: dict[str, Any] | None = None,
+        version: str = "1.0.0",
+        tags: list[str] | None = None,
+    ):
+        super().__init__(
+            name=name,
+            description=description,
+            input_schema=input_schema or self._default_input_schema(),
+            output_schema=output_schema or self._default_output_schema(),
+            version=version,
+            tags=tags or ["web", "crawl"],
+        )
+
+    @staticmethod
+    def _default_input_schema() -> dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "url": {
+                    "type": "string",
+                    "description": "要抓取的 URL",
+                },
+                "format": {
+                    "type": "string",
+                    "description": "输出格式：markdown 或 html",
+                    "default": "markdown",
+                    "enum": ["markdown", "html"],
+                },
+                "css_selector": {
+                    "type": "string",
+                    "description": "可选的 CSS 选择器，用于结构化提取",
+                },
+                "js_wait": {
+                    "type": "number",
+                    "description": "等待 JS 渲染的秒数",
+                    "default": 0,
+                },
+            },
+            "required": ["url"],
+        }
+
+    @staticmethod
+    def _default_output_schema() -> dict[str, Any]:
+        return {
+            "type": "object",
+            "properties": {
+                "content": {"type": "string", "description": "抓取到的内容"},
+                "status_code": {"type": "integer", "description": "HTTP 状态码"},
+                "links": {"type": "array", "items": {"type": "string"}, "description": "页面中的链接"},
+                "success": {"type": "boolean", "description": "是否成功"},
+                "error": {"type": "string", "description": "错误信息（仅失败时）"},
+            },
+        }
+
+    async def execute(self, **kwargs) -> dict:
+        """执行网页抓取
+
+        Args:
+            url: 要抓取的 URL（必需）
+            format: 输出格式 - "markdown" 或 "html"（默认 "markdown"）
+            css_selector: 可选的 CSS 选择器，用于结构化提取
+            js_wait: 等待 JS 渲染的秒数（默认 0）
+
+        Returns:
+            包含 content, status_code, links, success 的字典
+        """
+        url = kwargs.get("url")
+        if not url:
+            return {"error": "url 参数是必需的", "success": False}
+
+        output_format = kwargs.get("format", "markdown")
+        css_selector = kwargs.get("css_selector")
+        js_wait = kwargs.get("js_wait", 0)
+
+        # 优雅降级：Crawl4AI 未安装
+        if not _CRAWL4AI_AVAILABLE:
+            return {
+                "error": "Crawl4AI not installed. Run: pip install crawl4ai",
+                "success": False,
+            }
+
+        try:
+            extraction_strategy = None
+            if css_selector:
+                extraction_strategy = JsonCssExtractionStrategy(css_selector)
+
+            async with AsyncWebCrawler() as crawler:
+                result = await crawler.arun(
+                    url=url,
+                    extraction_strategy=extraction_strategy,
+                    js_wait=js_wait if js_wait else None,
+                )
+
+            # 提取内容
+            if output_format == "html":
+                content = result.html or ""
+            else:
+                content = result.markdown or ""
+
+            # 提取链接
+            links: list[str] = []
+            if hasattr(result, "links") and result.links:
+                links = result.links if isinstance(result.links, list) else []
+
+            status_code = result.status_code if hasattr(result, "status_code") else 200
+
+            response: dict[str, Any] = {
+                "content": content,
+                "status_code": status_code,
+                "links": links,
+                "success": True,
+            }
+
+            # 如果使用了 CSS 选择器提取，附加提取结果
+            if extraction_strategy and hasattr(result, "extracted_content") and result.extracted_content:
+                response["extracted"] = result.extracted_content
+
+            return response
+
+        except Exception as e:
+            logger.error(f"WebCrawlTool 抓取失败: {url} - {e}")
+            return {
+                "error": str(e),
+                "success": False,
+            }
--- a/tests/unit/test_schema_tools.py
+++ b/tests/unit/test_schema_tools.py
@ -0,0 +1,413 @@
+"""Schema 工具集单元测试 - SchemaExtractTool + SchemaGenerateTool"""
+
+import json
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from agentkit.tools.schema_tools import SchemaExtractTool, SchemaGenerateTool
+
+
+# ========== SchemaExtractTool 测试 ==========
+
+
+class TestSchemaExtractToolConstruction:
+    """测试 SchemaExtractTool 构造"""
+
+    def test_default_construction(self):
+        tool = SchemaExtractTool()
+        assert tool.name == "schema_extract"
+        assert tool.input_schema is not None
+        assert tool.output_schema is not None
+        assert "url_or_html" in tool.input_schema["properties"]
+        assert tool.input_schema["required"] == ["url_or_html"]
+
+    def test_custom_construction(self):
+        tool = SchemaExtractTool(
+            name="my_extractor",
+            description="自定义提取器",
+            version="2.0.0",
+        )
+        assert tool.name == "my_extractor"
+
+    def test_supported_formats(self):
+        tool = SchemaExtractTool()
+        assert "json-ld" in tool.SUPPORTED_FORMATS
+        assert "microdata" in tool.SUPPORTED_FORMATS
+        assert "rdfa" in tool.SUPPORTED_FORMATS
+        assert "dublincore" in tool.SUPPORTED_FORMATS
+
+    def test_to_dict(self):
+        tool = SchemaExtractTool()
+        d = tool.to_dict()
+        assert d["name"] == "schema_extract"
+
+
+class TestSchemaExtractToolGracefulDegradation:
+    """测试 extruct 不可用时的优雅降级"""
+
+    @pytest.mark.asyncio
+    async def test_execute_without_extruct(self):
+        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", False):
+            tool = SchemaExtractTool()
+            result = await tool.execute(url_or_html="<html></html>")
+            assert result["success"] is False
+            assert "extruct not installed" in result["error"]
+            assert "pip install extruct" in result["error"]
+            assert result["schemas"] == []
+
+
+class TestSchemaExtractToolValidation:
+    """测试输入验证"""
+
+    @pytest.mark.asyncio
+    async def test_execute_missing_url_or_html(self):
+        tool = SchemaExtractTool()
+        result = await tool.execute()
+        assert result["success"] is False
+        assert "url_or_html" in result["error"]
+
+    @pytest.mark.asyncio
+    async def test_execute_empty_url_or_html(self):
+        tool = SchemaExtractTool()
+        result = await tool.execute(url_or_html="")
+        assert result["success"] is False
+
+    @pytest.mark.asyncio
+    async def test_execute_invalid_format(self):
+        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True):
+            tool = SchemaExtractTool()
+            result = await tool.execute(url_or_html="<html></html>", formats=["invalid-format"])
+            assert result["success"] is False
+            assert "不支持" in result["error"] or "invalid" in result["error"].lower()
+
+
+class TestSchemaExtractToolWithMockedExtruct:
+    """使用 mock extruct 测试提取逻辑"""
+
+    SAMPLE_HTML_WITH_JSONLD = """
+    <html>
+    <head>
+        <script type="application/ld+json">
+        {
+            "@context": "https://schema.org",
+            "@type": "Organization",
+            "name": "Test Corp"
+        }
+        </script>
+    </head>
+    <body></body>
+    </html>
+    """
+
+    @pytest.mark.asyncio
+    async def test_extract_jsonld_from_html(self):
+        """测试从 HTML 中提取 JSON-LD"""
+        mock_extruct = MagicMock()
+        mock_extruct.extract.return_value = {
+            "json-ld": [
+                {"@context": "https://schema.org", "@type": "Organization", "name": "Test Corp"}
+            ]
+        }
+
+        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True), \
+             patch("agentkit.tools.schema_tools.extruct", mock_extruct):
+            tool = SchemaExtractTool()
+            result = await tool.execute(url_or_html=self.SAMPLE_HTML_WITH_JSONLD)
+            assert result["success"] is True
+            assert len(result["schemas"]) == 1
+            assert result["schemas"][0]["format"] == "json-ld"
+            assert result["schemas"][0]["data"]["@type"] == "Organization"
+            assert result["schemas"][0]["data"]["name"] == "Test Corp"
+
+    @pytest.mark.asyncio
+    async def test_extract_no_schema_data(self):
+        """测试 HTML 中没有结构化数据"""
+        mock_extruct = MagicMock()
+        mock_extruct.extract.return_value = {"json-ld": []}
+
+        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True), \
+             patch("agentkit.tools.schema_tools.extruct", mock_extruct):
+            tool = SchemaExtractTool()
+            result = await tool.execute(url_or_html="<html><body>No schema</body></html>")
+            assert result["success"] is True
+            assert result["schemas"] == []
+
+    @pytest.mark.asyncio
+    async def test_extract_multiple_formats(self):
+        """测试同时提取多种格式"""
+        mock_extruct = MagicMock()
+        mock_extruct.extract.return_value = {
+            "json-ld": [{"@type": "Organization", "name": "Corp"}],
+            "microdata": [{"type": "Product", "name": "Item"}],
+        }
+
+        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True), \
+             patch("agentkit.tools.schema_tools.extruct", mock_extruct):
+            tool = SchemaExtractTool()
+            result = await tool.execute(
+                url_or_html="<html></html>",
+                formats=["json-ld", "microdata"],
+            )
+            assert result["success"] is True
+            assert len(result["schemas"]) == 2
+            formats_found = {s["format"] for s in result["schemas"]}
+            assert "json-ld" in formats_found
+            assert "microdata" in formats_found
+
+    @pytest.mark.asyncio
+    async def test_extract_error_handling(self):
+        """测试提取异常处理"""
+        mock_extruct = MagicMock()
+        mock_extruct.extract.side_effect = Exception("Parse error")
+
+        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True), \
+             patch("agentkit.tools.schema_tools.extruct", mock_extruct):
+            tool = SchemaExtractTool()
+            result = await tool.execute(url_or_html="<html></html>")
+            assert result["success"] is False
+            assert "Parse error" in result["error"]
+
+    @pytest.mark.asyncio
+    async def test_extract_with_url(self):
+        """测试从 URL 提取（需要先获取 HTML）"""
+        mock_extruct = MagicMock()
+        mock_extruct.extract.return_value = {
+            "json-ld": [{"@type": "WebPage"}]
+        }
+
+        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True), \
+             patch("agentkit.tools.schema_tools.extruct", mock_extruct), \
+             patch("urllib.request.urlopen") as mock_urlopen:
+            mock_resp = MagicMock()
+            mock_resp.read.return_value = b"<html><body>Test</body></html>"
+            mock_resp.__enter__ = MagicMock(return_value=mock_resp)
+            mock_resp.__exit__ = MagicMock(return_value=None)
+            mock_urlopen.return_value = mock_resp
+
+            tool = SchemaExtractTool()
+            result = await tool.execute(url_or_html="https://example.com")
+            assert result["success"] is True
+
+
+# ========== SchemaGenerateTool 测试 ==========
+
+
+class TestSchemaGenerateToolConstruction:
+    """测试 SchemaGenerateTool 构造"""
+
+    def test_default_construction(self):
+        tool = SchemaGenerateTool()
+        assert tool.name == "schema_generate"
+        assert tool.input_schema is not None
+        assert tool.output_schema is not None
+        assert "schema_type" in tool.input_schema["properties"]
+        assert "properties" in tool.input_schema["properties"]
+
+    def test_supported_types(self):
+        tool = SchemaGenerateTool()
+        assert "Organization" in tool.SUPPORTED_TYPES
+        assert "FAQPage" in tool.SUPPORTED_TYPES
+        assert "Article" in tool.SUPPORTED_TYPES
+        assert "Product" in tool.SUPPORTED_TYPES
+        assert "HowTo" in tool.SUPPORTED_TYPES
+        assert "LocalBusiness" in tool.SUPPORTED_TYPES
+        assert "Person" in tool.SUPPORTED_TYPES
+        assert "BreadcrumbList" in tool.SUPPORTED_TYPES
+        assert "SiteNavigationElement" in tool.SUPPORTED_TYPES
+        assert "WebPage" in tool.SUPPORTED_TYPES
+
+
+class TestSchemaGenerateToolValidation:
+    """测试输入验证"""
+
+    @pytest.mark.asyncio
+    async def test_execute_missing_schema_type(self):
+        tool = SchemaGenerateTool()
+        result = await tool.execute(properties={"name": "Test"})
+        assert result["success"] is False
+        assert "schema_type" in result["error"]
+
+    @pytest.mark.asyncio
+    async def test_execute_missing_properties(self):
+        tool = SchemaGenerateTool()
+        result = await tool.execute(schema_type="Organization")
+        assert result["success"] is False
+        assert "properties" in result["error"]
+
+    @pytest.mark.asyncio
+    async def test_execute_invalid_schema_type(self):
+        tool = SchemaGenerateTool()
+        result = await tool.execute(schema_type="InvalidType", properties={"name": "Test"})
+        assert result["success"] is False
+        assert "不支持" in result["error"] or "InvalidType" in result["error"]
+
+    @pytest.mark.asyncio
+    async def test_execute_properties_not_dict(self):
+        tool = SchemaGenerateTool()
+        result = await tool.execute(schema_type="Organization", properties="not a dict")
+        assert result["success"] is False
+        assert "字典" in result["error"] or "dict" in result["error"].lower()
+
+
+class TestSchemaGenerateToolManualGeneration:
+    """测试手动 JSON-LD 生成（始终可用，无需外部依赖）"""
+
+    @pytest.mark.asyncio
+    async def test_generate_organization(self):
+        """测试生成 Organization 类型"""
+        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
+            tool = SchemaGenerateTool()
+            result = await tool.execute(
+                schema_type="Organization",
+                properties={"name": "Fischer AI", "url": "https://fischer.ai"},
+            )
+            assert result["success"] is True
+            assert result["schema_type"] == "Organization"
+
+            jsonld = json.loads(result["jsonld"])
+            assert jsonld["@context"] == "https://schema.org"
+            assert jsonld["@type"] == "Organization"
+            assert jsonld["name"] == "Fischer AI"
+            assert jsonld["url"] == "https://fischer.ai"
+
+    @pytest.mark.asyncio
+    async def test_generate_faq_page(self):
+        """测试生成 FAQPage 类型"""
+        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
+            tool = SchemaGenerateTool()
+            result = await tool.execute(
+                schema_type="FAQPage",
+                properties={
+                    "mainEntity": [
+                        {
+                            "@type": "Question",
+                            "name": "What is GEO?",
+                            "acceptedAnswer": {
+                                "@type": "Answer",
+                                "text": "Generative Engine Optimization",
+                            },
+                        }
+                    ]
+                },
+            )
+            assert result["success"] is True
+            jsonld = json.loads(result["jsonld"])
+            assert jsonld["@type"] == "FAQPage"
+            assert len(jsonld["mainEntity"]) == 1
+
+    @pytest.mark.asyncio
+    async def test_generate_article(self):
+        """测试生成 Article 类型"""
+        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
+            tool = SchemaGenerateTool()
+            result = await tool.execute(
+                schema_type="Article",
+                properties={
+                    "headline": "Test Article",
+                    "author": {"@type": "Person", "name": "John"},
+                },
+            )
+            assert result["success"] is True
+            jsonld = json.loads(result["jsonld"])
+            assert jsonld["@type"] == "Article"
+            assert jsonld["headline"] == "Test Article"
+
+    @pytest.mark.asyncio
+    async def test_generate_breadcrumb_list(self):
+        """测试生成 BreadcrumbList 类型"""
+        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
+            tool = SchemaGenerateTool()
+            result = await tool.execute(
+                schema_type="BreadcrumbList",
+                properties={
+                    "itemListElement": [
+                        {"@type": "ListItem", "position": 1, "name": "Home"},
+                    ]
+                },
+            )
+            assert result["success"] is True
+            jsonld = json.loads(result["jsonld"])
+            assert jsonld["@type"] == "BreadcrumbList"
+
+    @pytest.mark.asyncio
+    async def test_output_is_valid_jsonld(self):
+        """测试输出是有效的 JSON-LD（包含 @context 和 @type）"""
+        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
+            tool = SchemaGenerateTool()
+            for schema_type in ["Organization", "WebPage", "Product", "Person"]:
+                result = await tool.execute(
+                    schema_type=schema_type,
+                    properties={"name": f"Test {schema_type}"},
+                )
+                assert result["success"] is True
+                jsonld = json.loads(result["jsonld"])
+                assert "@context" in jsonld
+                assert jsonld["@context"] == "https://schema.org"
+                assert "@type" in jsonld
+                assert jsonld["@type"] == schema_type
+
+    @pytest.mark.asyncio
+    async def test_manual_generation_preserves_chinese(self):
+        """测试手动生成保留中文字符"""
+        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
+            tool = SchemaGenerateTool()
+            result = await tool.execute(
+                schema_type="Organization",
+                properties={"name": "费舍尔科技", "description": "AI 驱动的企业平台"},
+            )
+            assert result["success"] is True
+            jsonld = json.loads(result["jsonld"])
+            assert jsonld["name"] == "费舍尔科技"
+            assert jsonld["description"] == "AI 驱动的企业平台"
+
+
+class TestSchemaGenerateToolWithPydanticSchemaorg:
+    """测试 pydantic-schemaorg 可用时的行为"""
+
+    @pytest.mark.asyncio
+    async def test_fallback_to_manual_when_schemaorg_fails(self):
+        """当 pydantic-schemaorg 构建失败时，降级到手动生成"""
+        mock_schemaorg = MagicMock()
+        # 让 getattr 返回 None，模拟类型不存在
+        mock_schemaorg.Organization = None
+
+        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", True), \
+             patch("agentkit.tools.schema_tools.pydantic_schemaorg", mock_schemaorg):
+            tool = SchemaGenerateTool()
+            result = await tool.execute(
+                schema_type="Organization",
+                properties={"name": "Test"},
+            )
+            # 应该降级到手动生成
+            assert result["success"] is True
+            jsonld = json.loads(result["jsonld"])
+            assert jsonld["@type"] == "Organization"
+            assert jsonld["name"] == "Test"
+
+    @pytest.mark.asyncio
+    async def test_schemaorg_not_available_uses_manual(self):
+        """当 pydantic-schemaorg 不可用时，使用手动生成"""
+        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
+            tool = SchemaGenerateTool()
+            result = await tool.execute(
+                schema_type="Organization",
+                properties={"name": "Manual Corp"},
+            )
+            assert result["success"] is True
+            jsonld = json.loads(result["jsonld"])
+            assert jsonld["name"] == "Manual Corp"
+
+
+class TestSchemaGenerateToolSafeExecute:
+    """测试 safe_execute 钩子"""
+
+    @pytest.mark.asyncio
+    async def test_safe_execute_success(self):
+        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
+            tool = SchemaGenerateTool()
+            result = await tool.safe_execute(
+                schema_type="Organization",
+                properties={"name": "Test"},
+            )
+            assert result["success"] is True
--- a/tests/unit/test_web_crawl_tool.py
+++ b/tests/unit/test_web_crawl_tool.py
@ -0,0 +1,201 @@
+"""WebCrawlTool 单元测试"""
+
+import sys
+import types
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from agentkit.tools.web_crawl import WebCrawlTool
+
+
+class TestWebCrawlToolConstruction:
+    """测试 WebCrawlTool 构造"""
+
+    def test_default_construction(self):
+        tool = WebCrawlTool()
+        assert tool.name == "web_crawl"
+        assert "抓取" in tool.description or "crawl" in tool.description.lower()
+        assert tool.input_schema is not None
+        assert tool.output_schema is not None
+        assert "url" in tool.input_schema["properties"]
+        assert tool.input_schema["required"] == ["url"]
+
+    def test_custom_construction(self):
+        tool = WebCrawlTool(
+            name="my_crawler",
+            description="自定义爬虫",
+            version="2.0.0",
+            tags=["custom"],
+        )
+        assert tool.name == "my_crawler"
+        assert tool.description == "自定义爬虫"
+        assert tool.version == "2.0.0"
+        assert tool.tags == ["custom"]
+
+    def test_to_dict(self):
+        tool = WebCrawlTool()
+        d = tool.to_dict()
+        assert d["name"] == "web_crawl"
+        assert "input_schema" in d
+        assert "output_schema" in d
+
+    def test_repr(self):
+        tool = WebCrawlTool()
+        r = repr(tool)
+        assert "WebCrawlTool" in r
+        assert "web_crawl" in r
+
+
+class TestWebCrawlToolGracefulDegradation:
+    """测试 Crawl4AI 不可用时的优雅降级"""
+
+    @pytest.mark.asyncio
+    async def test_execute_without_crawl4ai(self):
+        """当 Crawl4AI 未安装时，返回安装提示"""
+        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", False):
+            tool = WebCrawlTool()
+            result = await tool.execute(url="https://example.com")
+            assert result["success"] is False
+            assert "Crawl4AI not installed" in result["error"]
+            assert "pip install crawl4ai" in result["error"]
+
+    @pytest.mark.asyncio
+    async def test_safe_execute_without_crawl4ai(self):
+        """safe_execute 在 Crawl4AI 不可用时也应正常返回"""
+        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", False):
+            tool = WebCrawlTool()
+            result = await tool.safe_execute(url="https://example.com")
+            assert result["success"] is False
+
+
+class TestWebCrawlToolValidation:
+    """测试输入验证"""
+
+    @pytest.mark.asyncio
+    async def test_execute_missing_url(self):
+        tool = WebCrawlTool()
+        result = await tool.execute()
+        assert result["success"] is False
+        assert "url" in result["error"]
+
+    @pytest.mark.asyncio
+    async def test_execute_empty_url(self):
+        tool = WebCrawlTool()
+        result = await tool.execute(url="")
+        assert result["success"] is False
+
+
+class TestWebCrawlToolWithMockedCrawl4AI:
+    """使用 mock Crawl4AI 测试正常抓取逻辑"""
+
+    def _make_mock_crawler(self, markdown="# Hello", html="<h1>Hello</h1>", links=None, status_code=200):
+        """创建 mock AsyncWebCrawler"""
+        mock_result = MagicMock()
+        mock_result.markdown = markdown
+        mock_result.html = html
+        mock_result.links = links or ["https://example.com/page1"]
+        mock_result.status_code = status_code
+        mock_result.extracted_content = None
+
+        mock_crawler = AsyncMock()
+        mock_crawler.arun = AsyncMock(return_value=mock_result)
+        mock_crawler.__aenter__ = AsyncMock(return_value=mock_crawler)
+        mock_crawler.__aexit__ = AsyncMock(return_value=None)
+
+        return mock_crawler, mock_result
+
+    @pytest.mark.asyncio
+    async def test_execute_markdown_format(self):
+        """测试 Markdown 格式输出"""
+        mock_crawler, _ = self._make_mock_crawler(markdown="# Test Page")
+
+        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
+             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
+            tool = WebCrawlTool()
+            result = await tool.execute(url="https://example.com", format="markdown")
+            assert result["success"] is True
+            assert result["content"] == "# Test Page"
+            assert result["status_code"] == 200
+
+    @pytest.mark.asyncio
+    async def test_execute_html_format(self):
+        """测试 HTML 格式输出"""
+        mock_crawler, _ = self._make_mock_crawler(html="<h1>Test</h1>")
+
+        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
+             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
+            tool = WebCrawlTool()
+            result = await tool.execute(url="https://example.com", format="html")
+            assert result["success"] is True
+            assert result["content"] == "<h1>Test</h1>"
+
+    @pytest.mark.asyncio
+    async def test_execute_with_links(self):
+        """测试链接提取"""
+        mock_crawler, _ = self._make_mock_crawler(links=["https://example.com/a", "https://example.com/b"])
+
+        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
+             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
+            tool = WebCrawlTool()
+            result = await tool.execute(url="https://example.com")
+            assert result["success"] is True
+            assert len(result["links"]) == 2
+
+    @pytest.mark.asyncio
+    async def test_execute_with_css_selector(self):
+        """测试 CSS 选择器提取"""
+        mock_crawler, mock_result = self._make_mock_crawler()
+        mock_result.extracted_content = '{"title": "Test"}'
+
+        mock_strategy_cls = MagicMock(return_value=MagicMock())
+
+        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
+             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler), \
+             patch("agentkit.tools.web_crawl.JsonCssExtractionStrategy", mock_strategy_cls):
+            tool = WebCrawlTool()
+            result = await tool.execute(url="https://example.com", css_selector="h1")
+            assert result["success"] is True
+            assert "extracted" in result
+            mock_strategy_cls.assert_called_once_with("h1")
+
+    @pytest.mark.asyncio
+    async def test_execute_with_js_wait(self):
+        """测试 JS 等待参数"""
+        mock_crawler, _ = self._make_mock_crawler()
+
+        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
+             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
+            tool = WebCrawlTool()
+            result = await tool.execute(url="https://example.com", js_wait=2)
+            assert result["success"] is True
+            # 验证 arun 被调用时传入了 js_wait 参数
+            call_kwargs = mock_crawler.arun.call_args
+            assert call_kwargs[1].get("js_wait") == 2 or call_kwargs[1].get("js_wait") is not None
+
+    @pytest.mark.asyncio
+    async def test_execute_crawl_error(self):
+        """测试抓取异常处理"""
+        mock_crawler = AsyncMock()
+        mock_crawler.arun = AsyncMock(side_effect=Exception("Connection timeout"))
+        mock_crawler.__aenter__ = AsyncMock(return_value=mock_crawler)
+        mock_crawler.__aexit__ = AsyncMock(return_value=None)
+
+        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
+             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
+            tool = WebCrawlTool()
+            result = await tool.execute(url="https://example.com")
+            assert result["success"] is False
+            assert "Connection timeout" in result["error"]
+
+    @pytest.mark.asyncio
+    async def test_execute_default_format_is_markdown(self):
+        """测试默认输出格式为 markdown"""
+        mock_crawler, _ = self._make_mock_crawler(markdown="MD content", html="HTML content")
+
+        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
+             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
+            tool = WebCrawlTool()
+            result = await tool.execute(url="https://example.com")
+            assert result["success"] is True
+            assert result["content"] == "MD content"