# Source: fischer-agentkit/tests/unit/test_web_crawl_tool.py
# Repository viewer metadata: 202 lines, 8.0 KiB, Python

"""WebCrawlTool 单元测试"""
import sys
import types
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from agentkit.tools.web_crawl import WebCrawlTool
class TestWebCrawlToolConstruction:
    """Construction and introspection checks for ``WebCrawlTool``."""

    def test_default_construction(self):
        """A tool built without arguments carries the default name and schemas."""
        crawler_tool = WebCrawlTool()
        assert crawler_tool.name == "web_crawl"

        # The description may be written in Chinese or in English.
        description = crawler_tool.description
        assert "抓取" in description or "crawl" in description.lower()

        assert crawler_tool.input_schema is not None
        assert crawler_tool.output_schema is not None

        # ``url`` must be declared and must be the only required input.
        input_schema = crawler_tool.input_schema
        assert "url" in input_schema["properties"]
        assert input_schema["required"] == ["url"]

    def test_custom_construction(self):
        """Keyword arguments passed to the constructor are exposed as attributes."""
        crawler_tool = WebCrawlTool(
            name="my_crawler",
            description="自定义爬虫",
            version="2.0.0",
            tags=["custom"],
        )

        assert crawler_tool.name == "my_crawler"
        assert crawler_tool.description == "自定义爬虫"
        assert crawler_tool.version == "2.0.0"
        assert crawler_tool.tags == ["custom"]

    def test_to_dict(self):
        """``to_dict`` includes the tool name and both schema entries."""
        as_dict = WebCrawlTool().to_dict()

        assert as_dict["name"] == "web_crawl"
        assert "input_schema" in as_dict
        assert "output_schema" in as_dict

    def test_repr(self):
        """``repr`` mentions both the class name and the tool name."""
        text = repr(WebCrawlTool())

        assert "WebCrawlTool" in text
        assert "web_crawl" in text
class TestWebCrawlToolGracefulDegradation:
    """Graceful degradation when Crawl4AI is flagged as unavailable."""

    @pytest.mark.asyncio
    async def test_execute_without_crawl4ai(self):
        """``execute`` reports failure with an installation hint."""
        # The tool is built inside the patch so construction also sees the flag.
        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", False):
            crawler_tool = WebCrawlTool()
            outcome = await crawler_tool.execute(url="https://example.com")

        assert outcome["success"] is False
        error_text = outcome["error"]
        assert "Crawl4AI not installed" in error_text
        assert "pip install crawl4ai" in error_text

    @pytest.mark.asyncio
    async def test_safe_execute_without_crawl4ai(self):
        """``safe_execute`` also returns a failure result instead of raising."""
        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", False):
            crawler_tool = WebCrawlTool()
            outcome = await crawler_tool.safe_execute(url="https://example.com")

        assert outcome["success"] is False
class TestWebCrawlToolValidation:
    """Input validation checks for ``WebCrawlTool.execute``."""

    @pytest.mark.asyncio
    async def test_execute_missing_url(self):
        """Calling ``execute`` without ``url`` fails and the error names ``url``."""
        outcome = await WebCrawlTool().execute()

        assert outcome["success"] is False
        assert "url" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_empty_url(self):
        """An empty ``url`` string is rejected."""
        outcome = await WebCrawlTool().execute(url="")

        assert outcome["success"] is False
class TestWebCrawlToolWithMockedCrawl4AI:
    """Exercise the crawl path of ``WebCrawlTool`` against a mocked Crawl4AI.

    Every test replaces ``agentkit.tools.web_crawl.AsyncWebCrawler`` with a
    factory returning a mock crawler and forces ``_CRAWL4AI_AVAILABLE`` to
    ``True``, so no real crawling is performed.
    """

    def _make_mock_crawler(
        self,
        markdown="# Hello",
        html="<h1>Hello</h1>",
        links=None,
        status_code=200,
    ):
        """Create a mock ``AsyncWebCrawler`` plus the result its ``arun`` returns.

        Args:
            markdown: Value exposed as ``result.markdown``.
            html: Value exposed as ``result.html``.
            links: Value exposed as ``result.links``. ``None`` selects a single
                default link; an explicitly passed empty list is kept as-is.
            status_code: Value exposed as ``result.status_code``.

        Returns:
            A ``(mock_crawler, mock_result)`` tuple. The crawler works as an
            async context manager that yields itself.
        """
        mock_result = MagicMock()
        mock_result.markdown = markdown
        mock_result.html = html
        # Compare against None rather than using ``or`` so that a test can
        # request "no links" with an empty list.
        mock_result.links = ["https://example.com/page1"] if links is None else links
        mock_result.status_code = status_code
        mock_result.extracted_content = None

        mock_crawler = AsyncMock()
        mock_crawler.arun = AsyncMock(return_value=mock_result)
        mock_crawler.__aenter__ = AsyncMock(return_value=mock_crawler)
        mock_crawler.__aexit__ = AsyncMock(return_value=None)
        return mock_crawler, mock_result

    @staticmethod
    def _patch_crawl4ai(mock_crawler, **extra_attrs):
        """Return a context manager that swaps in the mocked Crawl4AI symbols.

        Args:
            mock_crawler: Object returned when the tool module instantiates
                ``AsyncWebCrawler``.
            **extra_attrs: Additional ``agentkit.tools.web_crawl`` attributes
                to replace for the duration of the context.
        """
        return patch.multiple(
            "agentkit.tools.web_crawl",
            _CRAWL4AI_AVAILABLE=True,
            AsyncWebCrawler=MagicMock(return_value=mock_crawler),
            **extra_attrs,
        )

    @pytest.mark.asyncio
    async def test_execute_markdown_format(self):
        """``format="markdown"`` returns the crawler's markdown content."""
        mock_crawler, _ = self._make_mock_crawler(markdown="# Test Page")

        with self._patch_crawl4ai(mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", format="markdown")

        assert result["success"] is True
        assert result["content"] == "# Test Page"
        assert result["status_code"] == 200

    @pytest.mark.asyncio
    async def test_execute_html_format(self):
        """``format="html"`` returns the crawler's HTML content."""
        mock_crawler, _ = self._make_mock_crawler(html="<h1>Test</h1>")

        with self._patch_crawl4ai(mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", format="html")

        assert result["success"] is True
        assert result["content"] == "<h1>Test</h1>"

    @pytest.mark.asyncio
    async def test_execute_with_links(self):
        """Links reported by the crawler are surfaced in the result."""
        mock_crawler, _ = self._make_mock_crawler(
            links=["https://example.com/a", "https://example.com/b"],
        )

        with self._patch_crawl4ai(mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")

        assert result["success"] is True
        assert len(result["links"]) == 2

    @pytest.mark.asyncio
    async def test_execute_with_css_selector(self):
        """A ``css_selector`` builds an extraction strategy and yields ``extracted``."""
        mock_crawler, mock_result = self._make_mock_crawler()
        mock_result.extracted_content = '{"title": "Test"}'
        mock_strategy_cls = MagicMock(return_value=MagicMock())

        with self._patch_crawl4ai(
            mock_crawler,
            JsonCssExtractionStrategy=mock_strategy_cls,
        ):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", css_selector="h1")

        assert result["success"] is True
        assert "extracted" in result
        mock_strategy_cls.assert_called_once_with("h1")

    @pytest.mark.asyncio
    async def test_execute_with_js_wait(self):
        """``js_wait`` is forwarded to ``arun`` as a keyword argument."""
        mock_crawler, _ = self._make_mock_crawler()

        with self._patch_crawl4ai(mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", js_wait=2)

        assert result["success"] is True
        # Verify that arun received a js_wait keyword argument. The previous
        # ``== 2 or is not None`` check reduced to exactly this condition.
        # NOTE(review): tighten to ``== 2`` once it is confirmed that the tool
        # forwards the value unchanged.
        forwarded_js_wait = mock_crawler.arun.call_args.kwargs.get("js_wait")
        assert forwarded_js_wait is not None

    @pytest.mark.asyncio
    async def test_execute_crawl_error(self):
        """An exception raised by ``arun`` is reported as a failed result."""
        mock_crawler, _ = self._make_mock_crawler()
        # side_effect takes precedence over the helper's return_value.
        mock_crawler.arun.side_effect = Exception("Connection timeout")

        with self._patch_crawl4ai(mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")

        assert result["success"] is False
        assert "Connection timeout" in result["error"]

    @pytest.mark.asyncio
    async def test_execute_default_format_is_markdown(self):
        """Omitting ``format`` yields the markdown content, not the HTML."""
        mock_crawler, _ = self._make_mock_crawler(
            markdown="MD content",
            html="HTML content",
        )

        with self._patch_crawl4ai(mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")

        assert result["success"] is True
        assert result["content"] == "MD content"