feat(tools): U3 built-in Python tools - WebCrawl, SchemaExtract, SchemaGenerate

Add WebCrawlTool (Crawl4AI wrapper with graceful degradation), SchemaExtractTool (extruct-based Schema.org extraction), and SchemaGenerateTool (JSON-LD generation with optional pydantic-schemaorg validation). All tools work without optional dependencies.
2026-06-07 17:25:24 +08:00 · 2026-06-07 17:25:24 +08:00 · 9ec1740047
parent 550d29a139
commit 9ec1740047
5 changed files with 1124 additions and 0 deletions
--- a/src/agentkit/tools/init.py
+++ b/src/agentkit/tools/init.py
@ -6,6 +6,9 @@ from agentkit.tools.agent_tool import AgentTool
 from agentkit.tools.mcp_tool import MCPTool
 from agentkit.tools.registry import ToolRegistry
 from agentkit.tools.composition import SequentialChain, ParallelFanOut, DynamicSelector
 from agentkit.tools.web_crawl import WebCrawlTool
 from agentkit.tools.schema_tools import SchemaExtractTool, SchemaGenerateTool
 from agentkit.tools.baidu_search import BaiduSearchTool
 __all__ = [
    "Tool",
@ -16,4 +19,8 @@ __all__ = [
    "SequentialChain",
    "ParallelFanOut",
    "DynamicSelector",
    "WebCrawlTool",
    "SchemaExtractTool",
    "SchemaGenerateTool",
    "BaiduSearchTool",
 ]
--- a/src/agentkit/tools/schema_tools.py
+++ b/src/agentkit/tools/schema_tools.py
@ -0,0 +1,344 @@
 """Schema 工具集 - 结构化数据提取与生成
 SchemaExtractTool: 从 HTML 中提取 JSON-LD / Microdata / RDFa 等结构化数据
 SchemaGenerateTool: 生成 Schema.org JSON-LD 标记
 """
 import json
 import logging
 from typing import Any
 from agentkit.tools.base import Tool
 logger = logging.getLogger(__name__)
 # Probe for the optional extruct dependency. The module-level name is bound
 # to None first so it can be referenced (and patched in tests) even when the
 # package is missing.
 extruct = None
 try:
    import extruct
 except ImportError:
    _EXTRUCT_AVAILABLE = False
 else:
    _EXTRUCT_AVAILABLE = True
 # Same probe for the optional pydantic_schemaorg dependency.
 pydantic_schemaorg = None
 try:
    import pydantic_schemaorg
 except ImportError:
    _PYDANTIC_SCHEMAORG_AVAILABLE = False
 else:
    _PYDANTIC_SCHEMAORG_AVAILABLE = True
 class SchemaExtractTool(Tool):
    """Extract structured data (JSON-LD, Microdata, RDFa, Dublin Core) from HTML.

    Extraction is delegated to the optional ``extruct`` package. When it is
    not installed, ``execute`` returns an error result containing an install
    hint instead of raising.
    """

    SUPPORTED_FORMATS = {"json-ld", "microdata", "rdfa", "dublincore"}

    def __init__(
        self,
        name: str = "schema_extract",
        description: str = "从网页 HTML 中提取结构化数据（JSON-LD、Microdata、RDFa 等）",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Create the tool, falling back to the built-in schemas and tags."""
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["schema", "extraction"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the accepted keyword arguments."""
        return {
            "type": "object",
            "properties": {
                "url_or_html": {
                    "type": "string",
                    "description": "要提取的 URL 或原始 HTML 字符串",
                },
                "formats": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "要提取的格式列表",
                    "default": ["json-ld"],
                },
            },
            "required": ["url_or_html"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the result dictionary."""
        return {
            "type": "object",
            "properties": {
                "schemas": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "format": {"type": "string"},
                            "data": {"type": "object"},
                        },
                    },
                    "description": "提取到的结构化数据列表",
                },
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息（仅失败时）"},
            },
        }

    @staticmethod
    def _failure(message: str) -> dict[str, Any]:
        """Build the uniform failure result used by every error path."""
        return {"error": message, "schemas": [], "success": False}

    @staticmethod
    def _normalize_formats(raw: Any) -> list[str] | None:
        """Coerce the ``formats`` argument to a de-duplicated list of strings.

        ``None`` selects the default (``["json-ld"]``) and a bare string is
        treated as a single format. Returns ``None`` when the value cannot be
        interpreted as a collection of strings.
        """
        if raw is None:
            return ["json-ld"]
        if isinstance(raw, str):
            # Without this, set("json-ld") would split the name into characters.
            return [raw]
        try:
            items = list(raw)
        except TypeError:
            return None
        if not all(isinstance(item, str) for item in items):
            return None
        # dict.fromkeys drops duplicates while keeping the caller's order.
        return list(dict.fromkeys(items))

    @staticmethod
    def _fetch_html(url: str) -> str:
        """Download ``url`` synchronously and return the body decoded as UTF-8."""
        import urllib.request

        req = urllib.request.Request(url, headers={"User-Agent": "AgentKit/1.0"})
        with urllib.request.urlopen(req, timeout=30) as resp:
            return resp.read().decode("utf-8", errors="replace")

    def _is_url(self, text: str) -> bool:
        """Return True when ``text`` starts with an http(s) scheme, i.e. is a URL rather than HTML."""
        return text.strip().startswith(("http://", "https://"))

    async def execute(self, **kwargs) -> dict:
        """Extract structured data from a URL or a raw HTML string.

        Args:
            url_or_html: URL or raw HTML string (required).
            formats: Formats to extract; defaults to ``["json-ld"]``.
                Allowed values: "json-ld", "microdata", "rdfa", "dublincore".

        Returns:
            A dict with ``schemas`` (list of ``{"format", "data"}`` entries)
            and ``success``; on failure ``error`` holds the reason and
            ``schemas`` is empty.
        """
        import asyncio  # local import: only needed for the off-loop download

        url_or_html = kwargs.get("url_or_html")
        if not url_or_html:
            return self._failure("url_or_html 参数是必需的")

        formats = self._normalize_formats(kwargs.get("formats"))
        if formats is None:
            return self._failure("formats 必须是字符串或字符串列表")

        invalid_formats = set(formats) - self.SUPPORTED_FORMATS
        if invalid_formats:
            # Sorted so the message is deterministic.
            return self._failure(
                f"不支持的格式: {sorted(invalid_formats)}，支持的格式: {sorted(self.SUPPORTED_FORMATS)}"
            )

        # Graceful degradation: extruct is an optional dependency.
        if not _EXTRUCT_AVAILABLE:
            return self._failure("extruct not installed. Run: pip install extruct")

        try:
            html = url_or_html
            url = None
            if self._is_url(url_or_html):
                # Strip so surrounding whitespace does not end up in the request.
                url = url_or_html.strip()
                try:
                    # urlopen blocks; run it in a worker thread so the event
                    # loop stays responsive.
                    html = await asyncio.to_thread(self._fetch_html, url)
                except Exception as e:
                    return self._failure(f"获取 URL 内容失败: {e}")

            # extruct's keyword is ``syntaxes`` (it must be a list); unknown
            # keywords such as ``formats`` are rejected with a TypeError.
            data = extruct.extract(
                html,
                base_url=url or "",
                syntaxes=formats,
            )

            schemas: list[dict[str, Any]] = [
                {"format": fmt, "data": item}
                for fmt in formats
                for item in (data.get(fmt) or [])
            ]
            return {"schemas": schemas, "success": True}
        except Exception as e:
            logger.error("SchemaExtractTool 提取失败: %s", e)
            return self._failure(str(e))
 class SchemaGenerateTool(Tool):
    """Generate Schema.org JSON-LD markup for a fixed set of common types.

    When pydantic-schemaorg is available it is tried first so the properties
    are validated; otherwise (or when that attempt fails) the JSON-LD is
    assembled by hand, which needs no external dependency.
    """

    SUPPORTED_TYPES = {
        "Organization",
        "WebPage",
        "Article",
        "Product",
        "FAQPage",
        "HowTo",
        "LocalBusiness",
        "Person",
        "BreadcrumbList",
        "SiteNavigationElement",
    }

    def __init__(
        self,
        name: str = "schema_generate",
        description: str = "生成 Schema.org JSON-LD 结构化数据标记",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Create the tool, falling back to the built-in schemas and tags."""
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["schema", "generation"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the accepted keyword arguments."""
        return {
            "type": "object",
            "properties": {
                "schema_type": {
                    "type": "string",
                    "description": "Schema.org 类型名称，如 Organization、FAQPage 等",
                },
                "properties": {
                    "type": "object",
                    "description": "Schema 属性字典",
                },
            },
            "required": ["schema_type", "properties"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the result dictionary."""
        return {
            "type": "object",
            "properties": {
                "jsonld": {"type": "string", "description": "生成的 JSON-LD 字符串"},
                "schema_type": {"type": "string", "description": "Schema 类型"},
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息（仅失败时）"},
            },
        }

    def _generate_manual(self, schema_type: str, properties: dict[str, Any]) -> str:
        """Assemble the JSON-LD by hand (no external dependency required).

        ``@context`` and ``@type`` are written first; ``properties`` is merged
        afterwards, so a caller-supplied key of the same name overrides them.
        """
        jsonld_obj: dict[str, Any] = {
            "@context": "https://schema.org",
            "@type": schema_type,
        }
        jsonld_obj.update(properties)
        return json.dumps(jsonld_obj, ensure_ascii=False, indent=2)

    def _generate_with_schemaorg(self, schema_type: str, properties: dict[str, Any]) -> str | None:
        """Build the JSON-LD through pydantic-schemaorg, validating ``properties``.

        Returns:
            The JSON-LD string, or ``None`` when the package is unavailable,
            the type cannot be resolved, or construction/serialization fails.
            ``None`` tells the caller to fall back to manual generation.
        """
        if not _PYDANTIC_SCHEMAORG_AVAILABLE:
            return None
        import importlib  # local import: only this optional path needs it

        try:
            schema_cls = getattr(pydantic_schemaorg, schema_type, None)
            if not callable(schema_cls):
                # NOTE(review): pydantic-schemaorg appears to ship one
                # submodule per type (pydantic_schemaorg.<Type>.<Type>), so the
                # package attribute is missing or a module — confirm against
                # the installed version.
                module = importlib.import_module(f"pydantic_schemaorg.{schema_type}")
                schema_cls = getattr(module, schema_type, None)
            if not callable(schema_cls):
                return None
            instance = schema_cls(**properties)
            # Dump by alias so aliased fields serialize under their public
            # (JSON-LD) key rather than the Python attribute name.
            if hasattr(instance, "model_dump"):
                data = instance.model_dump(exclude_none=True, by_alias=True)
            elif hasattr(instance, "dict"):
                data = instance.dict(exclude_none=True, by_alias=True)
            else:
                return None
            jsonld_obj: dict[str, Any] = {
                "@context": "https://schema.org",
                "@type": schema_type,
            }
            jsonld_obj.update(data)
            return json.dumps(jsonld_obj, ensure_ascii=False, indent=2)
        except Exception as exc:
            # Deliberate best-effort: any failure falls back to manual
            # generation, but leave a trace so it can be diagnosed.
            logger.debug("pydantic-schemaorg generation failed for %s: %s", schema_type, exc)
            return None

    async def execute(self, **kwargs) -> dict:
        """Generate JSON-LD for a supported Schema.org type.

        Args:
            schema_type: Schema.org type name (required, e.g. "Organization").
            properties: Dict of schema properties (required).

        Returns:
            A dict with ``jsonld`` (string), ``schema_type`` and ``success``;
            on failure ``error`` holds the reason and ``jsonld`` is absent.
        """
        schema_type = kwargs.get("schema_type")
        properties = kwargs.get("properties")
        if not schema_type:
            return {"error": "schema_type 参数是必需的", "schema_type": "", "success": False}
        if not isinstance(schema_type, str):
            # An unhashable value (e.g. a list) would otherwise raise in the
            # set-membership test below, outside the try block.
            return {"error": "schema_type 必须是字符串", "schema_type": "", "success": False}
        if properties is None:
            return {"error": "properties 参数是必需的", "schema_type": schema_type, "success": False}
        if not isinstance(properties, dict):
            return {
                "error": "properties 必须是字典类型",
                "schema_type": schema_type,
                "success": False,
            }
        if schema_type not in self.SUPPORTED_TYPES:
            return {
                "error": f"不支持的 schema_type: {schema_type}，支持的类型: {sorted(self.SUPPORTED_TYPES)}",
                "schema_type": schema_type,
                "success": False,
            }
        try:
            # Prefer the validating path; fall back to manual assembly.
            jsonld = self._generate_with_schemaorg(schema_type, properties)
            if jsonld is None:
                jsonld = self._generate_manual(schema_type, properties)
            return {
                "jsonld": jsonld,
                "schema_type": schema_type,
                "success": True,
            }
        except Exception as e:
            logger.error("SchemaGenerateTool 生成失败: %s", e)
            return {
                "error": str(e),
                "schema_type": schema_type,
                "success": False,
            }
--- a/src/agentkit/tools/web_crawl.py
+++ b/src/agentkit/tools/web_crawl.py
@ -0,0 +1,159 @@
 """WebCrawlTool - 基于 Crawl4AI 的网页抓取工具，支持优雅降级"""
 import logging
 from typing import Any
 from agentkit.tools.base import Tool
 logger = logging.getLogger(__name__)
 # Probe for the optional Crawl4AI dependency. Both names are bound to None
 # first so they can be referenced (and patched in tests) when it is missing.
 AsyncWebCrawler = None
 JsonCssExtractionStrategy = None
 try:
    from crawl4ai import AsyncWebCrawler
    from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
 except ImportError:
    _CRAWL4AI_AVAILABLE = False
 else:
    _CRAWL4AI_AVAILABLE = True
 class WebCrawlTool(Tool):
    """Fetch a web page through Crawl4AI, degrading gracefully when it is absent.

    Supports Markdown or HTML output, an optional CSS selector for structured
    extraction and a JS-render wait. When Crawl4AI is not installed,
    ``execute`` returns an error result containing an install hint.
    """

    def __init__(
        self,
        name: str = "web_crawl",
        description: str = "抓取网页内容，支持 Markdown/HTML 输出和 CSS 选择器提取",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Create the tool, falling back to the built-in schemas and tags."""
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["web", "crawl"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the accepted keyword arguments."""
        return {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "要抓取的 URL",
                },
                "format": {
                    "type": "string",
                    "description": "输出格式：markdown 或 html",
                    "default": "markdown",
                    "enum": ["markdown", "html"],
                },
                "css_selector": {
                    "type": "string",
                    "description": "可选的 CSS 选择器，用于结构化提取",
                },
                "js_wait": {
                    "type": "number",
                    "description": "等待 JS 渲染的秒数",
                    "default": 0,
                },
            },
            "required": ["url"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the result dictionary."""
        return {
            "type": "object",
            "properties": {
                "content": {"type": "string", "description": "抓取到的内容"},
                "status_code": {"type": "integer", "description": "HTTP 状态码"},
                "links": {"type": "array", "items": {"type": "string"}, "description": "页面中的链接"},
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息（仅失败时）"},
            },
        }

    @staticmethod
    def _normalize_links(raw: Any) -> list[str]:
        """Flatten the crawler's link data into a list of URL strings.

        Accepts either a plain list or a dict of lists (Crawl4AI groups links
        as ``{"internal": [...], "external": [...]}``); entries may be strings
        or dicts carrying an ``href``. Anything else yields an empty list.
        """
        if not raw:
            return []
        if isinstance(raw, dict):
            groups = list(raw.values())
        elif isinstance(raw, list):
            groups = [raw]
        else:
            return []
        links: list[str] = []
        for group in groups:
            if not isinstance(group, list):
                continue
            for entry in group:
                href = entry.get("href") if isinstance(entry, dict) else entry
                if isinstance(href, str) and href:
                    links.append(href)
        return links

    async def execute(self, **kwargs) -> dict:
        """Crawl a single URL.

        Args:
            url: URL to fetch (required).
            format: Output format, "markdown" or "html" (default "markdown").
            css_selector: Optional CSS selector for structured extraction.
            js_wait: Seconds to wait for JS rendering (default 0).

        Returns:
            On success a dict with ``content``, ``status_code``, ``links`` and
            ``success`` (plus ``extracted`` when a selector produced output);
            on failure a dict with ``error`` and ``success`` set to False.
        """
        url = kwargs.get("url")
        if not url:
            return {"error": "url 参数是必需的", "success": False}
        output_format = kwargs.get("format", "markdown")
        css_selector = kwargs.get("css_selector")
        js_wait = kwargs.get("js_wait", 0)

        # Graceful degradation: Crawl4AI is an optional dependency.
        if not _CRAWL4AI_AVAILABLE:
            return {
                "error": "Crawl4AI not installed. Run: pip install crawl4ai",
                "success": False,
            }
        try:
            extraction_strategy = None
            if css_selector:
                # NOTE(review): Crawl4AI documents JsonCssExtractionStrategy as
                # taking a schema dict (baseSelector/fields), not a bare
                # selector string — confirm this call against the pinned version.
                extraction_strategy = JsonCssExtractionStrategy(css_selector)
            async with AsyncWebCrawler() as crawler:
                # NOTE(review): ``js_wait`` does not look like a documented
                # arun() option (see delay_before_return_html / wait_for) —
                # verify it has any effect.
                result = await crawler.arun(
                    url=url,
                    extraction_strategy=extraction_strategy,
                    js_wait=js_wait if js_wait else None,
                )

            # Crawl4AI reports many failures on the result object instead of
            # raising; an explicit False must not be returned as success.
            if getattr(result, "success", None) is False:
                message = getattr(result, "error_message", None) or "抓取失败"
                logger.error("WebCrawlTool 抓取失败: %s - %s", url, message)
                return {"error": str(message), "success": False}

            if output_format == "html":
                content = result.html or ""
            else:
                content = result.markdown or ""

            response: dict[str, Any] = {
                "content": content,
                "status_code": getattr(result, "status_code", 200),
                "links": self._normalize_links(getattr(result, "links", None)),
                "success": True,
            }
            # Attach the structured extraction output when a selector was used.
            if extraction_strategy and getattr(result, "extracted_content", None):
                response["extracted"] = result.extracted_content
            return response
        except Exception as e:
            logger.error("WebCrawlTool 抓取失败: %s - %s", url, e)
            return {
                "error": str(e),
                "success": False,
            }
--- a/tests/unit/test_schema_tools.py
+++ b/tests/unit/test_schema_tools.py
@ -0,0 +1,413 @@
 """Schema 工具集单元测试 - SchemaExtractTool + SchemaGenerateTool"""
 import json
 from unittest.mock import AsyncMock, MagicMock, patch
 import pytest
 from agentkit.tools.schema_tools import SchemaExtractTool, SchemaGenerateTool
 # ========== SchemaExtractTool 测试 ==========
 class TestSchemaExtractToolConstruction:
    """Construction and metadata of SchemaExtractTool."""

    def test_default_construction(self):
        extractor = SchemaExtractTool()
        assert extractor.name == "schema_extract"
        assert extractor.input_schema is not None
        assert extractor.output_schema is not None
        schema_in = extractor.input_schema
        assert "url_or_html" in schema_in["properties"]
        assert schema_in["required"] == ["url_or_html"]

    def test_custom_construction(self):
        overrides = {
            "name": "my_extractor",
            "description": "自定义提取器",
            "version": "2.0.0",
        }
        extractor = SchemaExtractTool(**overrides)
        assert extractor.name == "my_extractor"

    def test_supported_formats(self):
        extractor = SchemaExtractTool()
        for fmt in ("json-ld", "microdata", "rdfa", "dublincore"):
            assert fmt in extractor.SUPPORTED_FORMATS

    def test_to_dict(self):
        as_dict = SchemaExtractTool().to_dict()
        assert as_dict["name"] == "schema_extract"
 class TestSchemaExtractToolGracefulDegradation:
    """Graceful degradation when extruct is unavailable."""

    @pytest.mark.asyncio
    async def test_execute_without_extruct(self):
        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", False):
            outcome = await SchemaExtractTool().execute(url_or_html="<html></html>")
            assert outcome["success"] is False
            message = outcome["error"]
            assert "extruct not installed" in message
            assert "pip install extruct" in message
            assert outcome["schemas"] == []
 class TestSchemaExtractToolValidation:
    """Input validation of SchemaExtractTool.execute."""

    @pytest.mark.asyncio
    async def test_execute_missing_url_or_html(self):
        outcome = await SchemaExtractTool().execute()
        assert outcome["success"] is False
        assert "url_or_html" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_empty_url_or_html(self):
        outcome = await SchemaExtractTool().execute(url_or_html="")
        assert outcome["success"] is False

    @pytest.mark.asyncio
    async def test_execute_invalid_format(self):
        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True):
            extractor = SchemaExtractTool()
            outcome = await extractor.execute(
                url_or_html="<html></html>",
                formats=["invalid-format"],
            )
            assert outcome["success"] is False
            message = outcome["error"]
            assert "不支持" in message or "invalid" in message.lower()
 class TestSchemaExtractToolWithMockedExtruct:
    """Extraction logic exercised against a mocked extruct module."""

    SAMPLE_HTML_WITH_JSONLD = """
    <html>
    <head>
        <script type="application/ld+json">
        {
            "@context": "https://schema.org",
            "@type": "Organization",
            "name": "Test Corp"
        }
        </script>
    </head>
    <body></body>
    </html>
    """

    @pytest.mark.asyncio
    async def test_extract_jsonld_from_html(self):
        """JSON-LD embedded in HTML is returned as a single schema entry."""
        fake_extruct = MagicMock()
        fake_extruct.extract.return_value = {
            "json-ld": [
                {"@context": "https://schema.org", "@type": "Organization", "name": "Test Corp"}
            ]
        }
        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True), \
             patch("agentkit.tools.schema_tools.extruct", fake_extruct):
            extractor = SchemaExtractTool()
            outcome = await extractor.execute(url_or_html=self.SAMPLE_HTML_WITH_JSONLD)
            assert outcome["success"] is True
            assert len(outcome["schemas"]) == 1
            entry = outcome["schemas"][0]
            assert entry["format"] == "json-ld"
            assert entry["data"]["@type"] == "Organization"
            assert entry["data"]["name"] == "Test Corp"

    @pytest.mark.asyncio
    async def test_extract_no_schema_data(self):
        """HTML without structured data yields success and an empty list."""
        fake_extruct = MagicMock()
        fake_extruct.extract.return_value = {"json-ld": []}
        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True), \
             patch("agentkit.tools.schema_tools.extruct", fake_extruct):
            extractor = SchemaExtractTool()
            outcome = await extractor.execute(url_or_html="<html><body>No schema</body></html>")
            assert outcome["success"] is True
            assert outcome["schemas"] == []

    @pytest.mark.asyncio
    async def test_extract_multiple_formats(self):
        """Several formats can be requested in one call."""
        fake_extruct = MagicMock()
        fake_extruct.extract.return_value = {
            "json-ld": [{"@type": "Organization", "name": "Corp"}],
            "microdata": [{"type": "Product", "name": "Item"}],
        }
        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True), \
             patch("agentkit.tools.schema_tools.extruct", fake_extruct):
            extractor = SchemaExtractTool()
            outcome = await extractor.execute(
                url_or_html="<html></html>",
                formats=["json-ld", "microdata"],
            )
            assert outcome["success"] is True
            assert len(outcome["schemas"]) == 2
            seen_formats = {entry["format"] for entry in outcome["schemas"]}
            assert "json-ld" in seen_formats
            assert "microdata" in seen_formats

    @pytest.mark.asyncio
    async def test_extract_error_handling(self):
        """An exception raised by extruct is reported as a failure result."""
        fake_extruct = MagicMock()
        fake_extruct.extract.side_effect = Exception("Parse error")
        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True), \
             patch("agentkit.tools.schema_tools.extruct", fake_extruct):
            extractor = SchemaExtractTool()
            outcome = await extractor.execute(url_or_html="<html></html>")
            assert outcome["success"] is False
            assert "Parse error" in outcome["error"]

    @pytest.mark.asyncio
    async def test_extract_with_url(self):
        """A URL input is downloaded first, then passed to extraction."""
        fake_extruct = MagicMock()
        fake_extruct.extract.return_value = {
            "json-ld": [{"@type": "WebPage"}]
        }
        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True), \
             patch("agentkit.tools.schema_tools.extruct", fake_extruct), \
             patch("urllib.request.urlopen") as fake_urlopen:
            # The response mock doubles as its own context manager.
            response = MagicMock()
            response.read.return_value = b"<html><body>Test</body></html>"
            response.__enter__ = MagicMock(return_value=response)
            response.__exit__ = MagicMock(return_value=None)
            fake_urlopen.return_value = response
            extractor = SchemaExtractTool()
            outcome = await extractor.execute(url_or_html="https://example.com")
            assert outcome["success"] is True
 # ========== SchemaGenerateTool 测试 ==========
 class TestSchemaGenerateToolConstruction:
    """Construction and metadata of SchemaGenerateTool."""

    def test_default_construction(self):
        generator = SchemaGenerateTool()
        assert generator.name == "schema_generate"
        assert generator.input_schema is not None
        assert generator.output_schema is not None
        declared = generator.input_schema["properties"]
        assert "schema_type" in declared
        assert "properties" in declared

    def test_supported_types(self):
        generator = SchemaGenerateTool()
        expected_types = (
            "Organization",
            "FAQPage",
            "Article",
            "Product",
            "HowTo",
            "LocalBusiness",
            "Person",
            "BreadcrumbList",
            "SiteNavigationElement",
            "WebPage",
        )
        for type_name in expected_types:
            assert type_name in generator.SUPPORTED_TYPES
 class TestSchemaGenerateToolValidation:
    """Input validation of SchemaGenerateTool.execute."""

    @pytest.mark.asyncio
    async def test_execute_missing_schema_type(self):
        outcome = await SchemaGenerateTool().execute(properties={"name": "Test"})
        assert outcome["success"] is False
        assert "schema_type" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_missing_properties(self):
        outcome = await SchemaGenerateTool().execute(schema_type="Organization")
        assert outcome["success"] is False
        assert "properties" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_invalid_schema_type(self):
        generator = SchemaGenerateTool()
        outcome = await generator.execute(
            schema_type="InvalidType",
            properties={"name": "Test"},
        )
        assert outcome["success"] is False
        message = outcome["error"]
        assert "不支持" in message or "InvalidType" in message

    @pytest.mark.asyncio
    async def test_execute_properties_not_dict(self):
        generator = SchemaGenerateTool()
        outcome = await generator.execute(
            schema_type="Organization",
            properties="not a dict",
        )
        assert outcome["success"] is False
        message = outcome["error"]
        assert "字典" in message or "dict" in message.lower()
 class TestSchemaGenerateToolManualGeneration:
    """Manual JSON-LD generation (always available, no external dependency)."""

    @pytest.mark.asyncio
    async def test_generate_organization(self):
        """Generate an Organization."""
        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
            generator = SchemaGenerateTool()
            outcome = await generator.execute(
                schema_type="Organization",
                properties={"name": "Fischer AI", "url": "https://fischer.ai"},
            )
            assert outcome["success"] is True
            assert outcome["schema_type"] == "Organization"
            doc = json.loads(outcome["jsonld"])
            assert doc["@context"] == "https://schema.org"
            assert doc["@type"] == "Organization"
            assert doc["name"] == "Fischer AI"
            assert doc["url"] == "https://fischer.ai"

    @pytest.mark.asyncio
    async def test_generate_faq_page(self):
        """Generate a FAQPage with one nested question."""
        question = {
            "@type": "Question",
            "name": "What is GEO?",
            "acceptedAnswer": {
                "@type": "Answer",
                "text": "Generative Engine Optimization",
            },
        }
        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
            generator = SchemaGenerateTool()
            outcome = await generator.execute(
                schema_type="FAQPage",
                properties={"mainEntity": [question]},
            )
            assert outcome["success"] is True
            doc = json.loads(outcome["jsonld"])
            assert doc["@type"] == "FAQPage"
            assert len(doc["mainEntity"]) == 1

    @pytest.mark.asyncio
    async def test_generate_article(self):
        """Generate an Article with a nested author."""
        article_props = {
            "headline": "Test Article",
            "author": {"@type": "Person", "name": "John"},
        }
        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
            generator = SchemaGenerateTool()
            outcome = await generator.execute(
                schema_type="Article",
                properties=article_props,
            )
            assert outcome["success"] is True
            doc = json.loads(outcome["jsonld"])
            assert doc["@type"] == "Article"
            assert doc["headline"] == "Test Article"

    @pytest.mark.asyncio
    async def test_generate_breadcrumb_list(self):
        """Generate a BreadcrumbList."""
        crumbs = [
            {"@type": "ListItem", "position": 1, "name": "Home"},
        ]
        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
            generator = SchemaGenerateTool()
            outcome = await generator.execute(
                schema_type="BreadcrumbList",
                properties={"itemListElement": crumbs},
            )
            assert outcome["success"] is True
            doc = json.loads(outcome["jsonld"])
            assert doc["@type"] == "BreadcrumbList"

    @pytest.mark.asyncio
    async def test_output_is_valid_jsonld(self):
        """Output carries both @context and @type for several types."""
        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
            generator = SchemaGenerateTool()
            for type_name in ["Organization", "WebPage", "Product", "Person"]:
                outcome = await generator.execute(
                    schema_type=type_name,
                    properties={"name": f"Test {type_name}"},
                )
                assert outcome["success"] is True
                doc = json.loads(outcome["jsonld"])
                assert "@context" in doc
                assert doc["@context"] == "https://schema.org"
                assert "@type" in doc
                assert doc["@type"] == type_name

    @pytest.mark.asyncio
    async def test_manual_generation_preserves_chinese(self):
        """Chinese characters survive the JSON round trip."""
        chinese_props = {"name": "费舍尔科技", "description": "AI 驱动的企业平台"}
        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
            generator = SchemaGenerateTool()
            outcome = await generator.execute(
                schema_type="Organization",
                properties=chinese_props,
            )
            assert outcome["success"] is True
            doc = json.loads(outcome["jsonld"])
            assert doc["name"] == "费舍尔科技"
            assert doc["description"] == "AI 驱动的企业平台"
 class TestSchemaGenerateToolWithPydanticSchemaorg:
    """Behaviour when pydantic-schemaorg is (reported as) available."""

    @pytest.mark.asyncio
    async def test_fallback_to_manual_when_schemaorg_fails(self):
        """Fall back to manual generation when the pydantic-schemaorg path fails."""
        fake_schemaorg = MagicMock()
        # Make the attribute lookup return None to simulate a missing type.
        fake_schemaorg.Organization = None
        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", True), \
             patch("agentkit.tools.schema_tools.pydantic_schemaorg", fake_schemaorg):
            generator = SchemaGenerateTool()
            outcome = await generator.execute(
                schema_type="Organization",
                properties={"name": "Test"},
            )
            # The manual path should have produced the output.
            assert outcome["success"] is True
            doc = json.loads(outcome["jsonld"])
            assert doc["@type"] == "Organization"
            assert doc["name"] == "Test"

    @pytest.mark.asyncio
    async def test_schemaorg_not_available_uses_manual(self):
        """Use manual generation when pydantic-schemaorg is unavailable."""
        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
            generator = SchemaGenerateTool()
            outcome = await generator.execute(
                schema_type="Organization",
                properties={"name": "Manual Corp"},
            )
            assert outcome["success"] is True
            doc = json.loads(outcome["jsonld"])
            assert doc["name"] == "Manual Corp"
 class TestSchemaGenerateToolSafeExecute:
    """The safe_execute hook."""

    @pytest.mark.asyncio
    async def test_safe_execute_success(self):
        call_args = {
            "schema_type": "Organization",
            "properties": {"name": "Test"},
        }
        with patch("agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE", False):
            generator = SchemaGenerateTool()
            outcome = await generator.safe_execute(**call_args)
            assert outcome["success"] is True
--- a/tests/unit/test_web_crawl_tool.py
+++ b/tests/unit/test_web_crawl_tool.py
@ -0,0 +1,201 @@
 """WebCrawlTool 单元测试"""
 import sys
 import types
 from unittest.mock import AsyncMock, MagicMock, patch
 import pytest
 from agentkit.tools.web_crawl import WebCrawlTool
 class TestWebCrawlToolConstruction:
    """Check how WebCrawlTool is constructed and described."""

    def test_default_construction(self):
        """A tool built without arguments exposes its default name, description and schemas."""
        crawler_tool = WebCrawlTool()
        assert crawler_tool.name == "web_crawl"
        summary = crawler_tool.description
        assert "抓取" in summary or "crawl" in summary.lower()
        assert crawler_tool.input_schema is not None
        assert crawler_tool.output_schema is not None
        request_schema = crawler_tool.input_schema
        assert "url" in request_schema["properties"]
        assert request_schema["required"] == ["url"]

    def test_custom_construction(self):
        """Constructor keyword arguments are reflected on the instance."""
        crawler_tool = WebCrawlTool(
            name="my_crawler",
            description="自定义爬虫",
            version="2.0.0",
            tags=["custom"],
        )
        # Expected values stay as literals so a mutated argument would still be caught.
        assert crawler_tool.name == "my_crawler"
        assert crawler_tool.description == "自定义爬虫"
        assert crawler_tool.version == "2.0.0"
        assert crawler_tool.tags == ["custom"]

    def test_to_dict(self):
        """``to_dict`` carries the tool name and both schema entries."""
        serialized = WebCrawlTool().to_dict()
        assert serialized["name"] == "web_crawl"
        assert "input_schema" in serialized
        assert "output_schema" in serialized

    def test_repr(self):
        """``repr`` mentions both the class name and the tool name."""
        text = repr(WebCrawlTool())
        assert "WebCrawlTool" in text
        assert "web_crawl" in text
 class TestWebCrawlToolGracefulDegradation:
    """Check graceful degradation when Crawl4AI is unavailable."""

    @pytest.mark.asyncio
    async def test_execute_without_crawl4ai(self):
        """With Crawl4AI flagged as missing, ``execute`` returns an install hint."""
        flag_off = patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", False)
        with flag_off:
            crawler_tool = WebCrawlTool()
            outcome = await crawler_tool.execute(url="https://example.com")
        assert outcome["success"] is False
        message = outcome["error"]
        assert "Crawl4AI not installed" in message
        assert "pip install crawl4ai" in message

    @pytest.mark.asyncio
    async def test_safe_execute_without_crawl4ai(self):
        """``safe_execute`` must also return normally when Crawl4AI is flagged as missing."""
        flag_off = patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", False)
        with flag_off:
            crawler_tool = WebCrawlTool()
            outcome = await crawler_tool.safe_execute(url="https://example.com")
        assert outcome["success"] is False
 class TestWebCrawlToolValidation:
    """Check input validation of WebCrawlTool."""

    @pytest.mark.asyncio
    async def test_execute_missing_url(self):
        """Calling ``execute`` with no arguments fails and the error names ``url``."""
        outcome = await WebCrawlTool().execute()
        assert outcome["success"] is False
        assert "url" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_empty_url(self):
        """An empty ``url`` string is reported as a failure."""
        outcome = await WebCrawlTool().execute(url="")
        assert outcome["success"] is False
 class TestWebCrawlToolWithMockedCrawl4AI:
    """Exercise the normal crawl path against a mocked Crawl4AI."""

    def _make_mock_crawler(self, markdown="# Hello", html="<h1>Hello</h1>", links=None, status_code=200):
        """Build a mock ``AsyncWebCrawler`` and the result its ``arun`` returns.

        Args:
            markdown: Value exposed as ``result.markdown``.
            html: Value exposed as ``result.html``.
            links: Value exposed as ``result.links``. ``None`` selects a
                single default link; an explicit empty list is kept as-is.
            status_code: Value exposed as ``result.status_code``.

        Returns:
            A ``(mock_crawler, mock_result)`` tuple. The crawler acts as an
            async context manager that yields itself.
        """
        mock_result = MagicMock()
        mock_result.markdown = markdown
        mock_result.html = html
        # Compare against None: ``links or [...]`` would silently replace an
        # explicit empty list with the default link.
        mock_result.links = ["https://example.com/page1"] if links is None else links
        mock_result.status_code = status_code
        mock_result.extracted_content = None
        mock_crawler = AsyncMock()
        mock_crawler.arun = AsyncMock(return_value=mock_result)
        mock_crawler.__aenter__ = AsyncMock(return_value=mock_crawler)
        mock_crawler.__aexit__ = AsyncMock(return_value=None)
        return mock_crawler, mock_result

    @pytest.mark.asyncio
    async def test_execute_markdown_format(self):
        """``format="markdown"`` returns the crawl result's markdown as content."""
        mock_crawler, _ = self._make_mock_crawler(markdown="# Test Page")
        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", format="markdown")
            assert result["success"] is True
            assert result["content"] == "# Test Page"
            assert result["status_code"] == 200

    @pytest.mark.asyncio
    async def test_execute_html_format(self):
        """``format="html"`` returns the crawl result's HTML as content."""
        mock_crawler, _ = self._make_mock_crawler(html="<h1>Test</h1>")
        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", format="html")
            assert result["success"] is True
            assert result["content"] == "<h1>Test</h1>"

    @pytest.mark.asyncio
    async def test_execute_with_links(self):
        """Links from the crawl result are surfaced in the tool output."""
        mock_crawler, _ = self._make_mock_crawler(links=["https://example.com/a", "https://example.com/b"])
        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")
            assert result["success"] is True
            assert len(result["links"]) == 2

    @pytest.mark.asyncio
    async def test_execute_with_css_selector(self):
        """A ``css_selector`` builds the extraction strategy and yields ``extracted``."""
        mock_crawler, mock_result = self._make_mock_crawler()
        mock_result.extracted_content = '{"title": "Test"}'
        mock_strategy_cls = MagicMock(return_value=MagicMock())
        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler), \
             patch("agentkit.tools.web_crawl.JsonCssExtractionStrategy", mock_strategy_cls):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", css_selector="h1")
            assert result["success"] is True
            assert "extracted" in result
            mock_strategy_cls.assert_called_once_with("h1")

    @pytest.mark.asyncio
    async def test_execute_with_js_wait(self):
        """A ``js_wait`` argument reaches ``arun`` as a keyword argument."""
        mock_crawler, _ = self._make_mock_crawler()
        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", js_wait=2)
            assert result["success"] is True
            # Only the presence of a js_wait keyword is checked here.
            # NOTE(review): tighten to ``== 2`` once it is confirmed that the
            # tool forwards the value to arun() unchanged.
            forwarded_kwargs = mock_crawler.arun.call_args.kwargs
            assert forwarded_kwargs.get("js_wait") is not None

    @pytest.mark.asyncio
    async def test_execute_crawl_error(self):
        """An exception raised by ``arun`` is reported as a failed result."""
        mock_crawler, _ = self._make_mock_crawler()
        # side_effect takes precedence over the helper's return_value.
        mock_crawler.arun.side_effect = Exception("Connection timeout")
        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")
            assert result["success"] is False
            assert "Connection timeout" in result["error"]

    @pytest.mark.asyncio
    async def test_execute_default_format_is_markdown(self):
        """Without a ``format`` argument the markdown content is returned."""
        mock_crawler, _ = self._make_mock_crawler(markdown="MD content", html="HTML content")
        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")
            assert result["success"] is True
            assert result["content"] == "MD content"