# fischer-agentkit/tests/unit/test_web_crawl_tool.py

"""WebCrawlTool 单元测试"""

import sys
import types
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from agentkit.tools.web_crawl import WebCrawlTool


class TestWebCrawlToolConstruction:
    """Construction, serialization and repr checks for WebCrawlTool."""

    def test_default_construction(self):
        """A tool built without arguments exposes the default name and schemas."""
        crawler_tool = WebCrawlTool()
        assert crawler_tool.name == "web_crawl"

        # The description may be written in Chinese or English; accept either.
        description = crawler_tool.description
        assert "抓取" in description or "crawl" in description.lower()

        assert crawler_tool.input_schema is not None
        assert crawler_tool.output_schema is not None

        input_schema = crawler_tool.input_schema
        assert "url" in input_schema["properties"]
        assert input_schema["required"] == ["url"]

    def test_custom_construction(self):
        """Constructor keyword arguments are reflected on the instance."""
        overrides = {
            "name": "my_crawler",
            "description": "自定义爬虫",
            "version": "2.0.0",
            "tags": ["custom"],
        }
        crawler_tool = WebCrawlTool(**overrides)

        assert crawler_tool.name == "my_crawler"
        assert crawler_tool.description == "自定义爬虫"
        assert crawler_tool.version == "2.0.0"
        assert crawler_tool.tags == ["custom"]

    def test_to_dict(self):
        """to_dict() carries the tool name and both schema entries."""
        serialized = WebCrawlTool().to_dict()

        assert serialized["name"] == "web_crawl"
        for schema_key in ("input_schema", "output_schema"):
            assert schema_key in serialized

    def test_repr(self):
        """repr() mentions both the class name and the tool name."""
        text = repr(WebCrawlTool())

        for fragment in ("WebCrawlTool", "web_crawl"):
            assert fragment in text


class TestWebCrawlToolGracefulDegradation:
    """Graceful degradation when Crawl4AI is unavailable."""

    @pytest.mark.asyncio
    async def test_execute_without_crawl4ai(self):
        """execute() fails with an installation hint when Crawl4AI is missing."""
        crawl4ai_missing = patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", False)
        with crawl4ai_missing:
            crawler_tool = WebCrawlTool()
            outcome = await crawler_tool.execute(url="https://example.com")

            assert outcome["success"] is False
            for fragment in ("Crawl4AI not installed", "pip install crawl4ai"):
                assert fragment in outcome["error"]

    @pytest.mark.asyncio
    async def test_safe_execute_without_crawl4ai(self):
        """safe_execute() also returns a failure result when Crawl4AI is missing."""
        crawl4ai_missing = patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", False)
        with crawl4ai_missing:
            crawler_tool = WebCrawlTool()
            outcome = await crawler_tool.safe_execute(url="https://example.com")

            assert outcome["success"] is False


class TestWebCrawlToolValidation:
    """Input validation checks for WebCrawlTool.execute()."""

    @pytest.mark.asyncio
    async def test_execute_missing_url(self):
        """Calling execute() with no url yields a failure that mentions "url"."""
        # NOTE(review): _CRAWL4AI_AVAILABLE is not patched here, so this test
        # presumably relies on url validation not being pre-empted by the
        # availability check - confirm against the implementation.
        outcome = await WebCrawlTool().execute()

        assert outcome["success"] is False
        assert "url" in outcome["error"]

    @pytest.mark.asyncio
    async def test_execute_empty_url(self):
        """An empty url string yields a failure result."""
        outcome = await WebCrawlTool().execute(url="")

        assert outcome["success"] is False


class TestWebCrawlToolWithMockedCrawl4AI:
    """Exercise the normal crawl path against a mocked Crawl4AI."""

    def _make_mock_crawler(self, markdown="# Hello", html="<h1>Hello</h1>", links=None, status_code=200):
        """Build a mock ``AsyncWebCrawler`` plus the result its ``arun`` returns.

        Args:
            markdown: Value exposed as ``mock_result.markdown``.
            html: Value exposed as ``mock_result.html``.
            links: Value exposed as ``mock_result.links``. ``None`` selects a
                single-link default; an explicitly passed empty list is kept.
            status_code: Value exposed as ``mock_result.status_code``.

        Returns:
            A ``(mock_crawler, mock_result)`` tuple. ``mock_crawler`` works as
            an async context manager that yields itself, and awaiting
            ``mock_crawler.arun(...)`` returns ``mock_result``.
        """
        # Only None means "use the default". The previous ``links or [...]``
        # also replaced an explicit empty list, so a page without links could
        # not be simulated.
        if links is None:
            links = ["https://example.com/page1"]

        mock_result = MagicMock()
        mock_result.markdown = markdown
        mock_result.html = html
        mock_result.links = links
        mock_result.status_code = status_code
        mock_result.extracted_content = None

        mock_crawler = AsyncMock()
        mock_crawler.arun = AsyncMock(return_value=mock_result)
        # ``async with`` must hand back this same mock so that the configured
        # ``arun`` is the one that gets awaited.
        mock_crawler.__aenter__ = AsyncMock(return_value=mock_crawler)
        mock_crawler.__aexit__ = AsyncMock(return_value=None)

        return mock_crawler, mock_result

    @pytest.mark.asyncio
    async def test_execute_markdown_format(self):
        """format="markdown" returns the result's markdown and status code."""
        mock_crawler, _ = self._make_mock_crawler(markdown="# Test Page")

        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", format="markdown")
            assert result["success"] is True
            assert result["content"] == "# Test Page"
            assert result["status_code"] == 200

    @pytest.mark.asyncio
    async def test_execute_html_format(self):
        """format="html" returns the result's raw HTML as content."""
        mock_crawler, _ = self._make_mock_crawler(html="<h1>Test</h1>")

        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", format="html")
            assert result["success"] is True
            assert result["content"] == "<h1>Test</h1>"

    @pytest.mark.asyncio
    async def test_execute_with_links(self):
        """Links reported by the crawler are surfaced in the tool result."""
        mock_crawler, _ = self._make_mock_crawler(links=["https://example.com/a", "https://example.com/b"])

        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")
            assert result["success"] is True
            assert len(result["links"]) == 2

    @pytest.mark.asyncio
    async def test_execute_with_css_selector(self):
        """A css_selector builds an extraction strategy and yields "extracted"."""
        mock_crawler, mock_result = self._make_mock_crawler()
        mock_result.extracted_content = '{"title": "Test"}'

        mock_strategy_cls = MagicMock(return_value=MagicMock())

        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler), \
             patch("agentkit.tools.web_crawl.JsonCssExtractionStrategy", mock_strategy_cls):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", css_selector="h1")
            assert result["success"] is True
            assert "extracted" in result
            mock_strategy_cls.assert_called_once_with("h1")

    @pytest.mark.asyncio
    async def test_execute_with_js_wait(self):
        """A js_wait argument is forwarded to ``arun`` as a keyword argument."""
        mock_crawler, _ = self._make_mock_crawler()

        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com", js_wait=2)
            assert result["success"] is True
            # Verify that arun received a js_wait keyword argument. Only its
            # presence is checked: the former ``== 2 or is not None`` reduced
            # to exactly this, and whether the tool forwards the value
            # unchanged is not visible from this test module.
            forwarded_kwargs = mock_crawler.arun.call_args.kwargs
            assert forwarded_kwargs.get("js_wait") is not None

    @pytest.mark.asyncio
    async def test_execute_crawl_error(self):
        """An exception raised by ``arun`` becomes a failure result with its message."""
        # Reuse the shared helper and make arun raise; side_effect takes
        # precedence over the helper's configured return_value.
        mock_crawler, _ = self._make_mock_crawler()
        mock_crawler.arun.side_effect = Exception("Connection timeout")

        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")
            assert result["success"] is False
            assert "Connection timeout" in result["error"]

    @pytest.mark.asyncio
    async def test_execute_default_format_is_markdown(self):
        """Omitting ``format`` returns the markdown content, not the HTML."""
        mock_crawler, _ = self._make_mock_crawler(markdown="MD content", html="HTML content")

        with patch("agentkit.tools.web_crawl._CRAWL4AI_AVAILABLE", True), \
             patch("agentkit.tools.web_crawl.AsyncWebCrawler", return_value=mock_crawler):
            tool = WebCrawlTool()
            result = await tool.execute(url="https://example.com")
            assert result["success"] is True
            assert result["content"] == "MD content"