class SchemaExtractTool(Tool):
    """Extract structured data (JSON-LD, Microdata, RDFa, Dublin Core) from HTML.

    Parsing is delegated to the optional ``extruct`` package. When ``extruct``
    is not installed, ``execute`` returns an error payload with an install
    hint instead of raising.
    """

    # Values accepted in the ``formats`` argument; they are forwarded to
    # extruct unchanged as syntax names.
    SUPPORTED_FORMATS = {"json-ld", "microdata", "rdfa", "dublincore"}

    # Seconds to wait for the HTTP response when the input is a URL.
    FETCH_TIMEOUT = 30

    def __init__(
        self,
        name: str = "schema_extract",
        description: str = "从网页 HTML 中提取结构化数据(JSON-LD、Microdata、RDFa 等)",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Build the tool, using the built-in schemas/tags for omitted arguments."""
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["schema", "extraction"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON Schema describing ``execute``'s keyword arguments."""
        return {
            "type": "object",
            "properties": {
                "url_or_html": {
                    "type": "string",
                    "description": "要提取的 URL 或原始 HTML 字符串",
                },
                "formats": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "要提取的格式列表",
                    "default": ["json-ld"],
                },
            },
            "required": ["url_or_html"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the dict produced by ``execute``."""
        return {
            "type": "object",
            "properties": {
                "schemas": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "format": {"type": "string"},
                            "data": {"type": "object"},
                        },
                    },
                    "description": "提取到的结构化数据列表",
                },
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息(仅失败时)"},
            },
        }

    def _is_url(self, text: str) -> bool:
        """Return True when ``text`` (ignoring surrounding whitespace) is an http(s) URL."""
        return text.strip().startswith(("http://", "https://"))

    @staticmethod
    def _fetch_html(url: str, timeout: float = 30) -> str:
        """Download ``url`` synchronously and return the body decoded as UTF-8.

        Undecodable bytes are replaced rather than raising. This call blocks,
        so ``execute`` runs it in a worker thread.
        """
        import urllib.request

        request = urllib.request.Request(url, headers={"User-Agent": "AgentKit/1.0"})
        with urllib.request.urlopen(request, timeout=timeout) as resp:
            return resp.read().decode("utf-8", errors="replace")

    async def execute(self, **kwargs) -> dict:
        """Extract structured data from a URL or a raw HTML string.

        Args:
            url_or_html: URL (http/https) or raw HTML string. Required.
            formats: Formats to extract; a single string or a list of
                strings drawn from ``SUPPORTED_FORMATS``. Defaults to
                ``["json-ld"]`` when omitted or empty.

        Returns:
            ``{"schemas": [{"format": ..., "data": ...}, ...], "success": True}``
            on success; on failure ``{"error": ..., "schemas": [],
            "success": False}``. Failures are reported in the payload and not
            raised.
        """
        url_or_html = kwargs.get("url_or_html")
        if not url_or_html:
            return {"error": "url_or_html 参数是必需的", "schemas": [], "success": False}

        # Accept None / a bare string, and drop duplicates while keeping the
        # caller's order so each format is reported once.
        formats = kwargs.get("formats") or ["json-ld"]
        if isinstance(formats, str):
            formats = [formats]
        try:
            formats = list(dict.fromkeys(formats))
        except TypeError:
            return {"error": "formats 必须是字符串列表", "schemas": [], "success": False}

        invalid_formats = set(formats) - self.SUPPORTED_FORMATS
        if invalid_formats:
            return {
                "error": (
                    f"不支持的格式: {sorted(map(str, invalid_formats))},"
                    f"支持的格式: {sorted(self.SUPPORTED_FORMATS)}"
                ),
                "schemas": [],
                "success": False,
            }

        # Graceful degradation: extruct is an optional dependency.
        if not _EXTRUCT_AVAILABLE:
            return {
                "error": "extruct not installed. Run: pip install extruct",
                "schemas": [],
                "success": False,
            }

        try:
            html = url_or_html
            url = None

            if self._is_url(url_or_html):
                import asyncio

                url = url_or_html.strip()
                try:
                    # urlopen blocks; keep the event loop free while waiting.
                    html = await asyncio.to_thread(self._fetch_html, url, self.FETCH_TIMEOUT)
                except Exception as e:
                    return {
                        "error": f"获取 URL 内容失败: {e}",
                        "schemas": [],
                        "success": False,
                    }

            # extruct selects what to parse through ``syntaxes``; it has no
            # ``formats`` keyword.
            data = extruct.extract(
                html,
                base_url=url or "",
                syntaxes=formats,
            )

            schemas: list[dict[str, Any]] = [
                {"format": fmt, "data": item}
                for fmt in formats
                for item in (data.get(fmt) or [])
            ]
            return {"schemas": schemas, "success": True}

        except Exception as e:
            logger.exception("SchemaExtractTool 提取失败: %s", e)
            return {
                "error": str(e),
                "schemas": [],
                "success": False,
            }
class SchemaGenerateTool(Tool):
    """Generate Schema.org JSON-LD markup for a fixed set of common types.

    When ``pydantic-schemaorg`` is importable the properties are first passed
    through its model for the requested type; if that is unavailable or
    fails, the JSON-LD object is assembled directly from the given
    properties, which needs no external dependency.
    """

    SUPPORTED_TYPES = {
        "Organization",
        "WebPage",
        "Article",
        "Product",
        "FAQPage",
        "HowTo",
        "LocalBusiness",
        "Person",
        "BreadcrumbList",
        "SiteNavigationElement",
    }

    def __init__(
        self,
        name: str = "schema_generate",
        description: str = "生成 Schema.org JSON-LD 结构化数据标记",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Build the tool, using the built-in schemas/tags for omitted arguments."""
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["schema", "generation"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON Schema describing ``execute``'s keyword arguments."""
        return {
            "type": "object",
            "properties": {
                "schema_type": {
                    "type": "string",
                    "description": "Schema.org 类型名称,如 Organization、FAQPage 等",
                },
                "properties": {
                    "type": "object",
                    "description": "Schema 属性字典",
                },
            },
            "required": ["schema_type", "properties"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the dict produced by ``execute``."""
        return {
            "type": "object",
            "properties": {
                "jsonld": {"type": "string", "description": "生成的 JSON-LD 字符串"},
                "schema_type": {"type": "string", "description": "Schema 类型"},
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息(仅失败时)"},
            },
        }

    def _generate_manual(self, schema_type: str, properties: dict[str, Any]) -> str:
        """Serialize ``properties`` as JSON-LD without any validation.

        ``@context`` and ``@type`` are set first and ``properties`` is merged
        on top, so a caller-supplied ``@context``/``@type`` key overrides the
        defaults. Non-ASCII text is emitted as-is.
        """
        jsonld_obj: dict[str, Any] = {
            "@context": "https://schema.org",
            "@type": schema_type,
        }
        jsonld_obj.update(properties)
        return json.dumps(jsonld_obj, ensure_ascii=False, indent=2)

    @staticmethod
    def _resolve_schemaorg_class(schema_type: str) -> Any:
        """Return the pydantic-schemaorg model for ``schema_type``, or None.

        A non-module attribute on the package is used directly. Otherwise the
        per-type submodule is imported and the same-named attribute taken.
        NOTE(review): the per-type submodule layout
        (``pydantic_schemaorg.<Type>.<Type>``) is assumed from the package's
        documented usage - confirm against the installed version.
        """
        import importlib
        import inspect

        candidate = getattr(pydantic_schemaorg, schema_type, None)
        if candidate is not None and not inspect.ismodule(candidate):
            return candidate
        # Only plain identifiers may be turned into a module path.
        if not schema_type.isidentifier():
            return None
        try:
            module = importlib.import_module(f"pydantic_schemaorg.{schema_type}")
        except ImportError:
            return None
        return getattr(module, schema_type, None)

    def _generate_with_schemaorg(self, schema_type: str, properties: dict[str, Any]) -> str | None:
        """Build JSON-LD through pydantic-schemaorg.

        Returns:
            The JSON-LD string, or None when the library is unavailable, has
            no model for ``schema_type``, or rejects/cannot serialize the
            properties. None tells the caller to fall back to
            ``_generate_manual``.
        """
        if not _PYDANTIC_SCHEMAORG_AVAILABLE:
            return None

        try:
            schema_cls = self._resolve_schemaorg_class(schema_type)
            if schema_cls is None:
                return None

            instance = schema_cls(**properties)
            # Dump by alias so aliased fields come out under their JSON-LD
            # names; this is a no-op for models without aliases.
            if hasattr(instance, "model_dump"):
                data = instance.model_dump(by_alias=True, exclude_none=True)
            elif hasattr(instance, "dict"):
                data = instance.dict(by_alias=True, exclude_none=True)
            else:
                return None

            jsonld_obj: dict[str, Any] = {
                "@context": "https://schema.org",
                "@type": schema_type,
            }
            jsonld_obj.update(data)
            return json.dumps(jsonld_obj, ensure_ascii=False, indent=2)
        except Exception as exc:
            # Falling back is deliberate, but a rejected payload should leave
            # a trace instead of vanishing silently.
            logger.warning(
                "pydantic-schemaorg could not build %s, falling back to manual JSON-LD: %s",
                schema_type,
                exc,
            )
            return None

    async def execute(self, **kwargs) -> dict:
        """Generate a JSON-LD string for a supported Schema.org type.

        Args:
            schema_type: Schema.org type name from ``SUPPORTED_TYPES``. Required.
            properties: Dict of schema properties. Required (may be empty).

        Returns:
            ``{"jsonld": ..., "schema_type": ..., "success": True}`` on
            success; otherwise ``{"error": ..., "schema_type": ...,
            "success": False}``. Failures are reported in the payload and not
            raised.
        """
        schema_type = kwargs.get("schema_type")
        properties = kwargs.get("properties")

        if not schema_type:
            return {"error": "schema_type 参数是必需的", "schema_type": "", "success": False}

        # A non-string (e.g. a list) would otherwise raise on the set lookup below.
        if not isinstance(schema_type, str):
            return {
                "error": "schema_type 必须是字符串",
                "schema_type": str(schema_type),
                "success": False,
            }

        if properties is None:
            return {"error": "properties 参数是必需的", "schema_type": schema_type, "success": False}

        if not isinstance(properties, dict):
            return {
                "error": "properties 必须是字典类型",
                "schema_type": schema_type,
                "success": False,
            }

        if schema_type not in self.SUPPORTED_TYPES:
            return {
                "error": f"不支持的 schema_type: {schema_type},支持的类型: {sorted(self.SUPPORTED_TYPES)}",
                "schema_type": schema_type,
                "success": False,
            }

        try:
            # Prefer the library-backed path; None means "use the manual builder".
            jsonld = self._generate_with_schemaorg(schema_type, properties)
            if jsonld is None:
                jsonld = self._generate_manual(schema_type, properties)

            return {
                "jsonld": jsonld,
                "schema_type": schema_type,
                "success": True,
            }

        except Exception as e:
            logger.exception("SchemaGenerateTool 生成失败: %s", e)
            return {
                "error": str(e),
                "schema_type": schema_type,
                "success": False,
            }
class WebCrawlTool(Tool):
    """Fetch a web page through Crawl4AI and return its content and links.

    Supports Markdown or HTML output, an optional CSS-based extraction
    strategy and an optional JS wait. Crawl4AI is an optional dependency:
    when it is not installed, ``execute`` returns an error payload with an
    install hint instead of raising.
    """

    def __init__(
        self,
        name: str = "web_crawl",
        description: str = "抓取网页内容,支持 Markdown/HTML 输出和 CSS 选择器提取",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
    ):
        """Build the tool, using the built-in schemas/tags for omitted arguments."""
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["web", "crawl"],
        )

    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the JSON Schema describing ``execute``'s keyword arguments."""
        return {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "要抓取的 URL",
                },
                "format": {
                    "type": "string",
                    "description": "输出格式:markdown 或 html",
                    "default": "markdown",
                    "enum": ["markdown", "html"],
                },
                "css_selector": {
                    "type": "string",
                    "description": "可选的 CSS 选择器,用于结构化提取",
                },
                "js_wait": {
                    "type": "number",
                    "description": "等待 JS 渲染的秒数",
                    "default": 0,
                },
            },
            "required": ["url"],
        }

    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the JSON Schema describing the dict produced by ``execute``."""
        return {
            "type": "object",
            "properties": {
                "content": {"type": "string", "description": "抓取到的内容"},
                "status_code": {"type": "integer", "description": "HTTP 状态码"},
                "links": {"type": "array", "items": {"type": "string"}, "description": "页面中的链接"},
                "success": {"type": "boolean", "description": "是否成功"},
                "error": {"type": "string", "description": "错误信息(仅失败时)"},
            },
        }

    @staticmethod
    def _normalize_links(raw_links: Any) -> list[str]:
        """Flatten a crawl result's links into a list of URL strings.

        Accepts a plain list of strings, or a mapping of groups (such as
        internal/external) whose entries are strings or dicts carrying an
        ``href``. Anything else yields an empty list.
        """
        if not raw_links:
            return []
        if isinstance(raw_links, dict):
            groups = list(raw_links.values())
        elif isinstance(raw_links, (list, tuple)):
            groups = [raw_links]
        else:
            return []

        urls: list[str] = []
        for group in groups:
            if not isinstance(group, (list, tuple)):
                continue
            for entry in group:
                href = entry.get("href") if isinstance(entry, dict) else entry
                if isinstance(href, str) and href:
                    urls.append(href)
        return urls

    async def execute(self, **kwargs) -> dict:
        """Crawl a single URL.

        Args:
            url: URL to crawl. Required.
            format: ``"html"`` for raw HTML; any other value (default
                ``"markdown"``) returns Markdown.
            css_selector: Optional selector handed to the CSS extraction strategy.
            js_wait: Seconds to wait for JS rendering (default 0 = no wait).

        Returns:
            On success a dict with ``content``, ``status_code``, ``links`` and
            ``success: True`` (plus ``extracted`` when the extraction strategy
            produced output). On failure ``{"error": ..., "success": False}``.
            Failures are reported in the payload and not raised.
        """
        url = kwargs.get("url")
        if not url:
            return {"error": "url 参数是必需的", "success": False}

        output_format = kwargs.get("format", "markdown")
        css_selector = kwargs.get("css_selector")
        js_wait = kwargs.get("js_wait", 0)

        # Graceful degradation: Crawl4AI is an optional dependency.
        if not _CRAWL4AI_AVAILABLE:
            return {
                "error": "Crawl4AI not installed. Run: pip install crawl4ai",
                "success": False,
            }

        try:
            extraction_strategy = None
            if css_selector:
                # NOTE(review): Crawl4AI documents JsonCssExtractionStrategy as
                # taking a schema dict (baseSelector + fields), not a bare
                # selector string - confirm this call against the pinned version.
                extraction_strategy = JsonCssExtractionStrategy(css_selector)

            async with AsyncWebCrawler() as crawler:
                # NOTE(review): ``js_wait`` does not appear among arun()'s
                # documented options (see delay_before_return_html / wait_for)
                # - confirm it is honoured by the pinned version.
                result = await crawler.arun(
                    url=url,
                    extraction_strategy=extraction_strategy,
                    js_wait=js_wait if js_wait else None,
                )

            # Crawl4AI reports many failures on the result object instead of
            # raising; do not present those as a successful crawl.
            if getattr(result, "success", True) is False:
                return {
                    "error": getattr(result, "error_message", None) or "crawl failed",
                    "success": False,
                }

            raw_content = result.html if output_format == "html" else result.markdown
            # Coerce to a plain string so the payload matches the output schema
            # even when the library returns a string-like wrapper object.
            content = str(raw_content or "")

            response: dict[str, Any] = {
                "content": content,
                "status_code": getattr(result, "status_code", 200),
                "links": self._normalize_links(getattr(result, "links", None)),
                "success": True,
            }

            # Attach structured output only when a strategy ran and produced some.
            if extraction_strategy and getattr(result, "extracted_content", None):
                response["extracted"] = result.extracted_content

            return response

        except Exception as e:
            logger.exception("WebCrawlTool 抓取失败: %s - %s", url, e)
            return {
                "error": str(e),
                "success": False,
            }
# Smallest document that still counts as non-empty input for the tool. An
# empty string would trip the "url_or_html is required" check before the
# code under test is reached.
_MINIMAL_HTML = "<html><body></body></html>"


class TestSchemaExtractToolConstruction:
    """Constructor defaults and overrides of SchemaExtractTool."""

    def test_default_construction(self):
        tool = SchemaExtractTool()
        assert tool.name == "schema_extract"
        assert tool.input_schema is not None
        assert tool.output_schema is not None
        assert "url_or_html" in tool.input_schema["properties"]
        assert tool.input_schema["required"] == ["url_or_html"]

    def test_custom_construction(self):
        tool = SchemaExtractTool(
            name="my_extractor",
            description="自定义提取器",
            version="2.0.0",
        )
        assert tool.name == "my_extractor"

    def test_supported_formats(self):
        tool = SchemaExtractTool()
        assert "json-ld" in tool.SUPPORTED_FORMATS
        assert "microdata" in tool.SUPPORTED_FORMATS
        assert "rdfa" in tool.SUPPORTED_FORMATS
        assert "dublincore" in tool.SUPPORTED_FORMATS

    def test_to_dict(self):
        tool = SchemaExtractTool()
        d = tool.to_dict()
        assert d["name"] == "schema_extract"


class TestSchemaExtractToolGracefulDegradation:
    """Behaviour when the optional extruct dependency is missing."""

    @pytest.mark.asyncio
    async def test_execute_without_extruct(self):
        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", False):
            tool = SchemaExtractTool()
            # Input must be non-empty so the availability check is reached.
            result = await tool.execute(url_or_html=_MINIMAL_HTML)
            assert result["success"] is False
            assert "extruct not installed" in result["error"]
            assert "pip install extruct" in result["error"]
            assert result["schemas"] == []


class TestSchemaExtractToolValidation:
    """Input validation of SchemaExtractTool.execute."""

    @pytest.mark.asyncio
    async def test_execute_missing_url_or_html(self):
        tool = SchemaExtractTool()
        result = await tool.execute()
        assert result["success"] is False
        assert "url_or_html" in result["error"]

    @pytest.mark.asyncio
    async def test_execute_empty_url_or_html(self):
        tool = SchemaExtractTool()
        result = await tool.execute(url_or_html="")
        assert result["success"] is False

    @pytest.mark.asyncio
    async def test_execute_invalid_format(self):
        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True):
            tool = SchemaExtractTool()
            # Non-empty HTML, so the failure can only come from the format check.
            result = await tool.execute(url_or_html=_MINIMAL_HTML, formats=["invalid-format"])
            assert result["success"] is False
            assert "不支持" in result["error"] or "invalid" in result["error"].lower()


class TestSchemaExtractToolWithMockedExtruct:
    """Extraction logic exercised against a mocked extruct module."""

    SAMPLE_HTML_WITH_JSONLD = """
    <html>
    <head>
    <script type="application/ld+json">
    {"@context": "https://schema.org", "@type": "Organization", "name": "Test Corp"}
    </script>
    </head>
    <body></body>
    </html>
    """

    @pytest.mark.asyncio
    async def test_extract_jsonld_from_html(self):
        """JSON-LD items returned by extruct are wrapped with their format."""
        mock_extruct = MagicMock()
        mock_extruct.extract.return_value = {
            "json-ld": [
                {"@context": "https://schema.org", "@type": "Organization", "name": "Test Corp"}
            ]
        }

        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True), \
             patch("agentkit.tools.schema_tools.extruct", mock_extruct):
            tool = SchemaExtractTool()
            result = await tool.execute(url_or_html=self.SAMPLE_HTML_WITH_JSONLD)
            assert result["success"] is True
            assert len(result["schemas"]) == 1
            assert result["schemas"][0]["format"] == "json-ld"
            assert result["schemas"][0]["data"]["@type"] == "Organization"
            assert result["schemas"][0]["data"]["name"] == "Test Corp"

    @pytest.mark.asyncio
    async def test_extract_no_schema_data(self):
        """HTML without structured data yields an empty, successful result."""
        mock_extruct = MagicMock()
        mock_extruct.extract.return_value = {"json-ld": []}

        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True), \
             patch("agentkit.tools.schema_tools.extruct", mock_extruct):
            tool = SchemaExtractTool()
            result = await tool.execute(url_or_html="<html><body>No schema</body></html>")
            assert result["success"] is True
            assert result["schemas"] == []

    @pytest.mark.asyncio
    async def test_extract_multiple_formats(self):
        """Items from every requested format are reported."""
        mock_extruct = MagicMock()
        mock_extruct.extract.return_value = {
            "json-ld": [{"@type": "Organization", "name": "Corp"}],
            "microdata": [{"type": "Product", "name": "Item"}],
        }

        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True), \
             patch("agentkit.tools.schema_tools.extruct", mock_extruct):
            tool = SchemaExtractTool()
            result = await tool.execute(
                url_or_html=_MINIMAL_HTML,
                formats=["json-ld", "microdata"],
            )
            assert result["success"] is True
            assert len(result["schemas"]) == 2
            formats_found = {s["format"] for s in result["schemas"]}
            assert "json-ld" in formats_found
            assert "microdata" in formats_found

    @pytest.mark.asyncio
    async def test_extract_error_handling(self):
        """An exception raised by extruct is reported in the payload."""
        mock_extruct = MagicMock()
        mock_extruct.extract.side_effect = Exception("Parse error")

        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True), \
             patch("agentkit.tools.schema_tools.extruct", mock_extruct):
            tool = SchemaExtractTool()
            result = await tool.execute(url_or_html=_MINIMAL_HTML)
            assert result["success"] is False
            assert "Parse error" in result["error"]

    @pytest.mark.asyncio
    async def test_extract_with_url(self):
        """A URL input is fetched first, then passed to extruct."""
        mock_extruct = MagicMock()
        mock_extruct.extract.return_value = {
            "json-ld": [{"@type": "WebPage"}]
        }

        with patch("agentkit.tools.schema_tools._EXTRUCT_AVAILABLE", True), \
             patch("agentkit.tools.schema_tools.extruct", mock_extruct), \
             patch("urllib.request.urlopen") as mock_urlopen:
            mock_resp = MagicMock()
            mock_resp.read.return_value = b"<html><body>Test</body></html>"
            mock_resp.__enter__ = MagicMock(return_value=mock_resp)
            mock_resp.__exit__ = MagicMock(return_value=None)
            mock_urlopen.return_value = mock_resp

            tool = SchemaExtractTool()
            result = await tool.execute(url_or_html="https://example.com")
            assert result["success"] is True
# Patch targets shared by the SchemaGenerateTool tests below.
_SCHEMAORG_FLAG = "agentkit.tools.schema_tools._PYDANTIC_SCHEMAORG_AVAILABLE"
_SCHEMAORG_MODULE = "agentkit.tools.schema_tools.pydantic_schemaorg"


async def _generate_manually(schema_type, properties):
    """Run SchemaGenerateTool.execute with pydantic-schemaorg forced off."""
    with patch(_SCHEMAORG_FLAG, False):
        return await SchemaGenerateTool().execute(
            schema_type=schema_type,
            properties=properties,
        )


class TestSchemaGenerateToolConstruction:
    """Constructor defaults of SchemaGenerateTool."""

    def test_default_construction(self):
        generator = SchemaGenerateTool()
        assert generator.name == "schema_generate"
        assert generator.input_schema is not None
        assert generator.output_schema is not None
        declared = generator.input_schema["properties"]
        assert "schema_type" in declared
        assert "properties" in declared

    def test_supported_types(self):
        generator = SchemaGenerateTool()
        expected_types = (
            "Organization",
            "FAQPage",
            "Article",
            "Product",
            "HowTo",
            "LocalBusiness",
            "Person",
            "BreadcrumbList",
            "SiteNavigationElement",
            "WebPage",
        )
        for type_name in expected_types:
            assert type_name in generator.SUPPORTED_TYPES


class TestSchemaGenerateToolValidation:
    """Input validation of SchemaGenerateTool.execute."""

    @pytest.mark.asyncio
    async def test_execute_missing_schema_type(self):
        outcome = await SchemaGenerateTool().execute(properties={"name": "Test"})
        assert outcome["success"] is False
        assert "schema_type" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_missing_properties(self):
        outcome = await SchemaGenerateTool().execute(schema_type="Organization")
        assert outcome["success"] is False
        assert "properties" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_invalid_schema_type(self):
        outcome = await SchemaGenerateTool().execute(
            schema_type="InvalidType",
            properties={"name": "Test"},
        )
        assert outcome["success"] is False
        message = outcome["error"]
        assert "不支持" in message or "InvalidType" in message

    @pytest.mark.asyncio
    async def test_execute_properties_not_dict(self):
        outcome = await SchemaGenerateTool().execute(
            schema_type="Organization",
            properties="not a dict",
        )
        assert outcome["success"] is False
        message = outcome["error"]
        assert "字典" in message or "dict" in message.lower()


class TestSchemaGenerateToolManualGeneration:
    """Manual JSON-LD generation, which needs no external dependency."""

    @pytest.mark.asyncio
    async def test_generate_organization(self):
        """An Organization carries its context, type and given properties."""
        outcome = await _generate_manually(
            "Organization",
            {"name": "Fischer AI", "url": "https://fischer.ai"},
        )
        assert outcome["success"] is True
        assert outcome["schema_type"] == "Organization"

        document = json.loads(outcome["jsonld"])
        assert document["@context"] == "https://schema.org"
        assert document["@type"] == "Organization"
        assert document["name"] == "Fischer AI"
        assert document["url"] == "https://fischer.ai"

    @pytest.mark.asyncio
    async def test_generate_faq_page(self):
        """A FAQPage keeps its nested question list."""
        question = {
            "@type": "Question",
            "name": "What is GEO?",
            "acceptedAnswer": {
                "@type": "Answer",
                "text": "Generative Engine Optimization",
            },
        }
        outcome = await _generate_manually("FAQPage", {"mainEntity": [question]})
        assert outcome["success"] is True
        document = json.loads(outcome["jsonld"])
        assert document["@type"] == "FAQPage"
        assert len(document["mainEntity"]) == 1

    @pytest.mark.asyncio
    async def test_generate_article(self):
        """An Article keeps its headline."""
        outcome = await _generate_manually(
            "Article",
            {
                "headline": "Test Article",
                "author": {"@type": "Person", "name": "John"},
            },
        )
        assert outcome["success"] is True
        document = json.loads(outcome["jsonld"])
        assert document["@type"] == "Article"
        assert document["headline"] == "Test Article"

    @pytest.mark.asyncio
    async def test_generate_breadcrumb_list(self):
        """A BreadcrumbList is generated with the requested type."""
        crumbs = [{"@type": "ListItem", "position": 1, "name": "Home"}]
        outcome = await _generate_manually("BreadcrumbList", {"itemListElement": crumbs})
        assert outcome["success"] is True
        document = json.loads(outcome["jsonld"])
        assert document["@type"] == "BreadcrumbList"

    @pytest.mark.asyncio
    async def test_output_is_valid_jsonld(self):
        """Every output carries both @context and @type."""
        for type_name in ("Organization", "WebPage", "Product", "Person"):
            outcome = await _generate_manually(type_name, {"name": f"Test {type_name}"})
            assert outcome["success"] is True
            document = json.loads(outcome["jsonld"])
            assert "@context" in document
            assert document["@context"] == "https://schema.org"
            assert "@type" in document
            assert document["@type"] == type_name

    @pytest.mark.asyncio
    async def test_manual_generation_preserves_chinese(self):
        """Chinese text survives the JSON round trip unchanged."""
        outcome = await _generate_manually(
            "Organization",
            {"name": "费舍尔科技", "description": "AI 驱动的企业平台"},
        )
        assert outcome["success"] is True
        document = json.loads(outcome["jsonld"])
        assert document["name"] == "费舍尔科技"
        assert document["description"] == "AI 驱动的企业平台"


class TestSchemaGenerateToolWithPydanticSchemaorg:
    """Behaviour when pydantic-schemaorg is flagged as available."""

    @pytest.mark.asyncio
    async def test_fallback_to_manual_when_schemaorg_fails(self):
        """Generation falls back to the manual builder when the library path yields nothing."""
        fake_schemaorg = MagicMock()
        # Make the attribute lookup return None, i.e. "type not provided".
        fake_schemaorg.Organization = None

        with patch(_SCHEMAORG_FLAG, True), patch(_SCHEMAORG_MODULE, fake_schemaorg):
            outcome = await SchemaGenerateTool().execute(
                schema_type="Organization",
                properties={"name": "Test"},
            )

        # The manual builder must have produced the document.
        assert outcome["success"] is True
        document = json.loads(outcome["jsonld"])
        assert document["@type"] == "Organization"
        assert document["name"] == "Test"

    @pytest.mark.asyncio
    async def test_schemaorg_not_available_uses_manual(self):
        """Manual generation is used when the library is flagged unavailable."""
        outcome = await _generate_manually("Organization", {"name": "Manual Corp"})
        assert outcome["success"] is True
        document = json.loads(outcome["jsonld"])
        assert document["name"] == "Manual Corp"


class TestSchemaGenerateToolSafeExecute:
    """The safe_execute hook of SchemaGenerateTool."""

    @pytest.mark.asyncio
    async def test_safe_execute_success(self):
        with patch(_SCHEMAORG_FLAG, False):
            outcome = await SchemaGenerateTool().safe_execute(
                schema_type="Organization",
                properties={"name": "Test"},
            )
        assert outcome["success"] is True
class TestWebCrawlToolConstruction:
    """Constructor defaults and overrides of WebCrawlTool."""

    def test_default_construction(self):
        tool = WebCrawlTool()
        assert tool.name == "web_crawl"
        assert "抓取" in tool.description or "crawl" in tool.description.lower()
        assert tool.input_schema is not None
        assert tool.output_schema is not None
        assert "url" in tool.input_schema["properties"]
        assert tool.input_schema["required"] == ["url"]

    def test_custom_construction(self):
        tool = WebCrawlTool(
            name="my_crawler",
            description="自定义爬虫",
            version="2.0.0",
            tags=["custom"],
        )
        assert tool.name == "my_crawler"
        assert tool.description == "自定义爬虫"
        assert tool.version == "2.0.0"
        assert tool.tags == ["custom"]

    def test_to_dict(self):
        tool = WebCrawlTool()
        d = tool.to_dict()
        assert d["name"] == "web_crawl"
        assert "input_schema" in d
        assert "output_schema" in d

    def test_repr(self):
        tool = WebCrawlTool()
        r = repr(tool)
        assert "WebCrawlTool" in r
        assert "web_crawl" in r


class TestWebCrawlToolGracefulDegradation:
    """Behaviour when the optional Crawl4AI dependency is missing."""

    @pytest.mark.asyncio
    async def test_execute_without_crawl4ai(self):
        """An install hint is returned when Crawl4AI is not installed."""
        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", False):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")
            assert result["success"] is False
            assert "Crawl4AI not installed" in result["error"]
            assert "pip install crawl4ai" in result["error"]

    @pytest.mark.asyncio
    async def test_safe_execute_without_crawl4ai(self):
        """safe_execute also returns normally when Crawl4AI is unavailable."""
        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", False):
            tool = WebCrawlTool()
            result = await tool.safe_execute(url="https://example.com")
            assert result["success"] is False


class TestWebCrawlToolValidation:
    """Input validation of WebCrawlTool.execute."""

    @pytest.mark.asyncio
    async def test_execute_missing_url(self):
        tool = WebCrawlTool()
        result = await tool.execute()
        assert result["success"] is False
        assert "url" in result["error"]

    @pytest.mark.asyncio
    async def test_execute_empty_url(self):
        tool = WebCrawlTool()
        result = await tool.execute(url="")
        assert result["success"] is False


class TestWebCrawlToolWithMockedCrawl4AI:
    """Normal crawl flow exercised against a mocked Crawl4AI."""

    def _make_mock_crawler(self, markdown="# Hello", html="<h1>Hello</h1>", links=None, status_code=200):
        """Return ``(crawler, result)`` mocks; the crawler works as an async context manager."""
        mock_result = MagicMock()
        mock_result.markdown = markdown
        mock_result.html = html
        mock_result.links = links or ["https://example.com/page1"]
        mock_result.status_code = status_code
        mock_result.extracted_content = None

        mock_crawler = AsyncMock()
        mock_crawler.arun = AsyncMock(return_value=mock_result)
        mock_crawler.__aenter__ = AsyncMock(return_value=mock_crawler)
        mock_crawler.__aexit__ = AsyncMock(return_value=None)

        return mock_crawler, mock_result

    @pytest.mark.asyncio
    async def test_execute_markdown_format(self):
        """Markdown output returns the result's markdown."""
        mock_crawler, _ = self._make_mock_crawler(markdown="# Test Page")

        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", format="markdown")
            assert result["success"] is True
            assert result["content"] == "# Test Page"
            assert result["status_code"] == 200

    @pytest.mark.asyncio
    async def test_execute_html_format(self):
        """HTML output returns the result's raw HTML."""
        mock_crawler, _ = self._make_mock_crawler(html="<h1>Test</h1>")

        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", format="html")
            assert result["success"] is True
            assert result["content"] == "<h1>Test</h1>"

    @pytest.mark.asyncio
    async def test_execute_with_links(self):
        """Links found by the crawler are returned."""
        mock_crawler, _ = self._make_mock_crawler(links=["https://example.com/a", "https://example.com/b"])

        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")
            assert result["success"] is True
            assert len(result["links"]) == 2

    @pytest.mark.asyncio
    async def test_execute_with_css_selector(self):
        """A CSS selector builds an extraction strategy and exposes its output."""
        mock_crawler, mock_result = self._make_mock_crawler()
        mock_result.extracted_content = '{"title": "Test"}'

        mock_strategy_cls = MagicMock(return_value=MagicMock())

        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler), \
             patch("agentkit.tools.web_crawl.JsonCssExtractionStrategy", mock_strategy_cls):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", css_selector="h1")
            assert result["success"] is True
            assert "extracted" in result
            mock_strategy_cls.assert_called_once_with("h1")

    @pytest.mark.asyncio
    async def test_execute_with_js_wait(self):
        """The js_wait value is forwarded to arun unchanged."""
        mock_crawler, _ = self._make_mock_crawler()

        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", js_wait=2)
            assert result["success"] is True
            # Pin the exact value: "== 2 or is not None" would accept any value.
            assert mock_crawler.arun.call_args.kwargs.get("js_wait") == 2

    @pytest.mark.asyncio
    async def test_execute_crawl_error(self):
        """An exception raised by the crawler is reported in the payload."""
        mock_crawler = AsyncMock()
        mock_crawler.arun = AsyncMock(side_effect=Exception("Connection timeout"))
        mock_crawler.__aenter__ = AsyncMock(return_value=mock_crawler)
        mock_crawler.__aexit__ = AsyncMock(return_value=None)

        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")
            assert result["success"] is False
            assert "Connection timeout" in result["error"]

    @pytest.mark.asyncio
    async def test_execute_default_format_is_markdown(self):
        """Markdown is returned when no format is given."""
        mock_crawler, _ = self._make_mock_crawler(markdown="MD content", html="HTML content")

        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")
            assert result["success"] is True
            assert result["content"] == "MD content"