202 lines
8.0 KiB
Python
202 lines
8.0 KiB
Python
"""WebCrawlTool 单元测试"""
|
|
|
|
import sys
|
|
import types
|
|
from unittest.mock import AsyncMock, MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from agentkit.tools.web_crawl import WebCrawlTool
|
|
|
|
|
|
class TestWebCrawlToolConstruction:
    """Construction, serialisation and repr checks for ``WebCrawlTool``."""

    def test_default_construction(self):
        """A tool built with no arguments exposes the default metadata and schemas."""
        crawl_tool = WebCrawlTool()

        assert crawl_tool.name == "web_crawl"
        # The description may be written in Chinese or in English.
        assert "抓取" in crawl_tool.description or "crawl" in crawl_tool.description.lower()
        assert crawl_tool.input_schema is not None
        assert crawl_tool.output_schema is not None

        # ``url`` must be declared, and it must be the only required input.
        assert "url" in crawl_tool.input_schema["properties"]
        assert crawl_tool.input_schema["required"] == ["url"]

    def test_custom_construction(self):
        """Keyword arguments passed to the constructor are exposed as attributes."""
        crawl_tool = WebCrawlTool(
            name="my_crawler",
            description="自定义爬虫",
            version="2.0.0",
            tags=["custom"],
        )

        assert crawl_tool.name == "my_crawler"
        assert crawl_tool.description == "自定义爬虫"
        assert crawl_tool.version == "2.0.0"
        assert crawl_tool.tags == ["custom"]

    def test_to_dict(self):
        """``to_dict`` reports the tool name and both schema entries."""
        serialized = WebCrawlTool().to_dict()

        assert serialized["name"] == "web_crawl"
        for schema_key in ("input_schema", "output_schema"):
            assert schema_key in serialized

    def test_repr(self):
        """``repr`` mentions both the class name and the tool name."""
        text = repr(WebCrawlTool())

        for fragment in ("WebCrawlTool", "web_crawl"):
            assert fragment in text
|
|
|
|
|
|
class TestWebCrawlToolGracefulDegradation:
    """Graceful degradation when Crawl4AI is reported as unavailable."""

    # Module-level availability flag of the module under test.
    _AVAILABLE_FLAG = "agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE"

    @pytest.mark.asyncio
    async def test_execute_without_crawl4ai(self):
        """``execute`` returns an install hint instead of crawling."""
        with patch(self._AVAILABLE_FLAG, False):
            outcome = await WebCrawlTool().execute(url="https://example.com")

            assert outcome["success"] is False
            error_text = outcome["error"]
            assert "Crawl4AI not installed" in error_text
            assert "pip install crawl4ai" in error_text

    @pytest.mark.asyncio
    async def test_safe_execute_without_crawl4ai(self):
        """``safe_execute`` also returns a failure result in that situation."""
        with patch(self._AVAILABLE_FLAG, False):
            outcome = await WebCrawlTool().safe_execute(url="https://example.com")

            assert outcome["success"] is False
|
|
|
|
|
|
class TestWebCrawlToolValidation:
    """Input validation of ``WebCrawlTool.execute``."""

    @pytest.mark.asyncio
    async def test_execute_missing_url(self):
        """Calling ``execute`` without ``url`` fails and the error names ``url``."""
        outcome = await WebCrawlTool().execute()

        assert outcome["success"] is False
        assert "url" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_empty_url(self):
        """An empty ``url`` string is reported as a failure."""
        outcome = await WebCrawlTool().execute(url="")

        assert outcome["success"] is False
|
|
|
|
|
|
class TestWebCrawlToolWithMockedCrawl4AI:
    """Exercise the regular crawl path with Crawl4AI replaced by mocks."""

    # Patch targets inside the module under test.
    _AVAILABLE_FLAG = "agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE"
    _CRAWLER_CLS = "agentkit.tools.web_crawl.AsyncWebCrawler"
    _STRATEGY_CLS = "agentkit.tools.web_crawl.JsonCssExtractionStrategy"

    def _make_mock_crawler(self, markdown="# Hello", html="<h1>Hello</h1>", links=None, status_code=200):
        """Build a mocked ``AsyncWebCrawler`` together with its crawl result.

        Args:
            markdown: Value exposed as ``result.markdown``.
            html: Value exposed as ``result.html``.
            links: Value exposed as ``result.links``; any falsy value
                (``None`` or an empty list) falls back to a single default link.
            status_code: Value exposed as ``result.status_code``.

        Returns:
            A ``(crawler, result)`` tuple; awaiting ``crawler.arun(...)``
            yields ``result``.
        """
        crawl_result = MagicMock(
            markdown=markdown,
            html=html,
            links=links or ["https://example.com/page1"],
            status_code=status_code,
            extracted_content=None,
        )

        crawler = AsyncMock()
        crawler.arun = AsyncMock(return_value=crawl_result)
        # Make the mock usable in ``async with``; entering yields the crawler itself.
        crawler.__aenter__ = AsyncMock(return_value=crawler)
        crawler.__aexit__ = AsyncMock(return_value=None)

        return crawler, crawl_result

    @pytest.mark.asyncio
    async def test_execute_markdown_format(self):
        """``format="markdown"`` returns the result's markdown and status code."""
        crawler, _ = self._make_mock_crawler(markdown="# Test Page")

        with patch(self._AVAILABLE_FLAG, True), patch(self._CRAWLER_CLS, return_value=crawler):
            outcome = await WebCrawlTool().execute(url="https://example.com", format="markdown")

            assert outcome["success"] is True
            assert outcome["content"] == "# Test Page"
            assert outcome["status_code"] == 200

    @pytest.mark.asyncio
    async def test_execute_html_format(self):
        """``format="html"`` returns the result's raw HTML."""
        crawler, _ = self._make_mock_crawler(html="<h1>Test</h1>")

        with patch(self._AVAILABLE_FLAG, True), patch(self._CRAWLER_CLS, return_value=crawler):
            outcome = await WebCrawlTool().execute(url="https://example.com", format="html")

            assert outcome["success"] is True
            assert outcome["content"] == "<h1>Test</h1>"

    @pytest.mark.asyncio
    async def test_execute_with_links(self):
        """Links found by the crawler are reported in the result."""
        found_links = ["https://example.com/a", "https://example.com/b"]
        crawler, _ = self._make_mock_crawler(links=found_links)

        with patch(self._AVAILABLE_FLAG, True), patch(self._CRAWLER_CLS, return_value=crawler):
            outcome = await WebCrawlTool().execute(url="https://example.com")

            assert outcome["success"] is True
            assert len(outcome["links"]) == 2

    @pytest.mark.asyncio
    async def test_execute_with_css_selector(self):
        """A ``css_selector`` builds the extraction strategy and yields ``extracted``."""
        crawler, crawl_result = self._make_mock_crawler()
        crawl_result.extracted_content = '{"title": "Test"}'

        strategy_cls = MagicMock(return_value=MagicMock())

        with patch(self._AVAILABLE_FLAG, True), patch(self._CRAWLER_CLS, return_value=crawler):
            with patch(self._STRATEGY_CLS, strategy_cls):
                outcome = await WebCrawlTool().execute(url="https://example.com", css_selector="h1")

                assert outcome["success"] is True
                assert "extracted" in outcome
                strategy_cls.assert_called_once_with("h1")

    @pytest.mark.asyncio
    async def test_execute_with_js_wait(self):
        """``js_wait`` is forwarded to ``arun`` as a keyword argument."""
        crawler, _ = self._make_mock_crawler()

        with patch(self._AVAILABLE_FLAG, True), patch(self._CRAWLER_CLS, return_value=crawler):
            outcome = await WebCrawlTool().execute(url="https://example.com", js_wait=2)

            assert outcome["success"] is True
            # Verify that arun received a js_wait keyword argument.
            # NOTE(review): only presence is checked, not the exact value. The
            # earlier form ``== 2 or ... is not None`` reduces to this check.
            # Tighten to ``== 2`` if the tool forwards the value unchanged.
            forwarded = crawler.arun.call_args.kwargs
            assert forwarded.get("js_wait") is not None

    @pytest.mark.asyncio
    async def test_execute_crawl_error(self):
        """An exception raised by ``arun`` becomes a failure result carrying its message."""
        crawler, _ = self._make_mock_crawler()
        crawler.arun = AsyncMock(side_effect=Exception("Connection timeout"))

        with patch(self._AVAILABLE_FLAG, True), patch(self._CRAWLER_CLS, return_value=crawler):
            outcome = await WebCrawlTool().execute(url="https://example.com")

            assert outcome["success"] is False
            assert "Connection timeout" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_default_format_is_markdown(self):
        """Without an explicit ``format`` the markdown content is returned."""
        crawler, _ = self._make_mock_crawler(markdown="MD content", html="HTML content")

        with patch(self._AVAILABLE_FLAG, True), patch(self._CRAWLER_CLS, return_value=crawler):
            outcome = await WebCrawlTool().execute(url="https://example.com")

            assert outcome["success"] is True
            assert outcome["content"] == "MD content"
|