# fischer-agentkit/tests/unit/test_gateway_cache.py

"""Integration tests for LLM Cache integration into LLMGateway (U2/U17).

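U17 update: the gateway now uses ``LitellmCacheManager`` (LiteLLM's built-in cache).
The old hand-rolled ``InMemoryLLMCache`` logic has been removed; cache reads and
writes are handled inside LiteLLM.
The tests use ``CacheAwareMockProvider`` to simulate LiteLLM's cache behavior
(it checks ``cache_key`` and returns a response with ``cache_hit=True`` on a hit).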
"""

import pytest

from agentkit.llm.config import CacheConfig, LLMConfig
from agentkit.llm.gateway import LLMGateway
from agentkit.llm.protocol import LLMProvider, LLMRequest, LLMResponse, TokenUsage


class MockProvider(LLMProvider):
    """Cache-unaware stand-in for a real LLM provider.

    Each ``chat`` call bumps ``call_count`` and is answered with the canned
    text supplied at construction time.
    """

    def __init__(self, response_content: str = "Mock response"):
        self._response_content = response_content
        self.call_count = 0

    async def chat(self, request: LLMRequest) -> LLMResponse:
        """Record the call and reply with the canned content for ``request.model``."""
        self.call_count += 1
        reply_text = self._response_content
        model_name = request.model
        fixed_usage = TokenUsage(prompt_tokens=10, completion_tokens=20)
        return LLMResponse(content=reply_text, model=model_name, usage=fixed_usage)


class CacheAwareMockProvider(LLMProvider):
    """Mock provider that imitates a response cache keyed by ``cache_key``.

    The cache parameters are read from ``request._cache`` (a missing or falsy
    attribute is treated as an empty dict):

    * hit  -> a copy of the stored response is returned with ``cache_hit=True``
      and ``call_count`` is left untouched;
    * miss -> ``call_count`` is incremented, a fresh response with
      ``cache_hit=False`` is produced and, when a key is present and
      ``"no-cache"`` is not set, stored for later requests.
    """

    def __init__(self, response_content: str = "Mock response"):
        self._response_content = response_content
        self._cache: dict[str, LLMResponse] = {}
        # Counts real calls only, i.e. cache misses.
        self.call_count = 0

    async def chat(self, request: LLMRequest) -> LLMResponse:
        """Serve ``request`` from the internal cache, or fabricate a new reply."""
        params = getattr(request, "_cache", None) or {}
        key = params.get("cache_key")
        bypass = params.get("no-cache", False)

        # Cache hit: hand back the stored payload without counting a call.
        stored = self._cache.get(key) if key else None
        if stored is not None and not bypass:
            return LLMResponse(
                content=stored.content,
                model=stored.model,
                usage=stored.usage,
                cache_hit=True,
            )

        # Cache miss: this counts as a real provider call.
        self.call_count += 1
        fresh = LLMResponse(
            content=self._response_content,
            model=request.model,
            usage=TokenUsage(prompt_tokens=10, completion_tokens=20),
            cache_hit=False,
        )
        if not key or bypass:
            return fresh
        self._cache[key] = fresh
        return fresh


def _make_messages(user_content: str = "Hello") -> list[dict[str, str]]:
    """Build a two-turn chat transcript: a fixed system prompt plus one user turn."""
    system_turn = {"role": "system", "content": "You are a helpful assistant."}
    user_turn = {"role": "user", "content": user_content}
    return [system_turn, user_turn]


class TestCacheDisabled:
    """Gateway behavior when no cache configuration is supplied."""

    @pytest.mark.asyncio
    async def test_no_cache_by_default(self):
        """With the default gateway, identical requests each reach the provider."""
        provider = MockProvider()
        gateway = LLMGateway()
        gateway.register_provider("test", provider)

        conversation = _make_messages()
        for _ in range(2):
            await gateway.chat(conversation, "test/model")

        assert provider.call_count == 2


class TestCacheEnabled:
    """Gateway behavior with the in-memory cache backend switched on.

    ``CacheAwareMockProvider`` stands in for the caching layer, so
    ``provider.call_count`` only grows on cache misses.
    """

    @staticmethod
    def _cached_gateway() -> tuple[LLMGateway, CacheAwareMockProvider]:
        """Return a cache-enabled gateway and the provider registered as ``"test"``."""
        config = LLMConfig(cache=CacheConfig(enabled=True, backend="memory"))
        gateway = LLMGateway(config=config)
        provider = CacheAwareMockProvider()
        gateway.register_provider("test", provider)
        return gateway, provider

    @pytest.mark.asyncio
    async def test_first_request_is_miss(self):
        """First request is a cache miss — provider is called."""
        gateway, provider = self._cached_gateway()

        msgs = _make_messages()
        response = await gateway.chat(msgs, "test/model", temperature=0.0)

        assert provider.call_count == 1
        assert response.content == "Mock response"
        assert response.cache_hit is False

    @pytest.mark.asyncio
    async def test_second_request_is_hit(self):
        """Second identical request is a cache hit — provider NOT called again."""
        gateway, provider = self._cached_gateway()

        msgs = _make_messages()
        await gateway.chat(msgs, "test/model", temperature=0.0)
        response = await gateway.chat(msgs, "test/model", temperature=0.0)

        assert provider.call_count == 1  # Not called again (cache hit)
        assert response.content == "Mock response"
        assert response.cache_hit is True

    @pytest.mark.asyncio
    async def test_cache_hit_usage_has_zero_cost(self):
        """A cache hit is still recorded in usage and adds no cost."""
        gateway, provider = self._cached_gateway()

        msgs = _make_messages()
        await gateway.chat(msgs, "test/model", agent_name="agent1", temperature=0.0)
        second = await gateway.chat(
            msgs, "test/model", agent_name="agent1", temperature=0.0
        )

        # Confirm the second request really was served from cache; without
        # this the usage assertions below say nothing about cache hits.
        assert provider.call_count == 1
        assert second.cache_hit is True

        usage = gateway.get_usage(agent_name="agent1")
        # No pricing is configured here, so the miss and the hit both cost 0.
        assert usage.total_cost == 0.0
        # Both the miss and the hit must appear in the usage records.
        assert len(usage.records) == 2

    @pytest.mark.asyncio
    async def test_different_messages_are_miss(self):
        """Different messages produce cache misses."""
        gateway, provider = self._cached_gateway()

        await gateway.chat(_make_messages("Hello"), "test/model", temperature=0.0)
        await gateway.chat(_make_messages("World"), "test/model", temperature=0.0)

        assert provider.call_count == 2


class TestCacheConfig:
    """Parsing of the ``cache`` config section and its effect on the gateway."""

    def test_config_from_dict(self):
        """CacheConfig can be loaded from dict."""
        config = LLMConfig.from_dict(
            {
                "cache": {
                    "enabled": True,
                    "backend": "memory",
                    "exact_ttl": 7200,
                }
            }
        )
        assert config.cache is not None
        assert config.cache.enabled is True
        assert config.cache.backend == "memory"
        assert config.cache.exact_ttl == 7200

    def test_config_from_dict_no_cache(self):
        """No cache section in config → cache is None."""
        config = LLMConfig.from_dict({})
        assert config.cache is None

    def test_config_from_dict_embedding(self):
        """Embedding config is loaded correctly."""
        config = LLMConfig.from_dict(
            {
                "cache": {
                    "enabled": True,
                    "embedding": {
                        "provider": "xinference",
                        "model": "bge-m3",
                        "base_url": "http://localhost:9997/v1",
                    },
                }
            }
        )
        # Guard first so a missing cache section fails with a clear assertion
        # instead of an AttributeError on None.
        assert config.cache is not None
        assert config.cache.embedding_provider == "xinference"
        assert config.cache.embedding_model == "bge-m3"
        assert config.cache.embedding_base_url == "http://localhost:9997/v1"

    def test_gateway_creates_cache_when_enabled(self):
        """Gateway creates cache_manager instance when cache.enabled=True."""
        config = LLMConfig(cache=CacheConfig(enabled=True, backend="memory"))
        gateway = LLMGateway(config=config)
        assert gateway._cache_manager is not None

    def test_gateway_no_cache_when_disabled(self):
        """Gateway has no cache_manager when cache is disabled."""
        config = LLMConfig(cache=CacheConfig(enabled=False))
        gateway = LLMGateway(config=config)
        assert gateway._cache_manager is None

    def test_gateway_no_cache_when_no_config(self):
        """Gateway has no cache_manager when cache config is absent."""
        gateway = LLMGateway()
        assert gateway._cache_manager is None