416 lines
16 KiB
Python
416 lines
16 KiB
Python
"""集成测试 - Soul 进化触发条件

测试 EvolutionMixin.evolve_soul 的多维触发逻辑:
- 时间窗口内反思计数
- 质量梯度(下降分数)触发早期进化
- 任务类型权重调整触发阈值
- 时间衰减降低旧反思的有效计数

仅 mock MemoryTool(文件 I/O),使用真实 EvolutionMixin + SoulEvolutionConfig 实例。
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
from datetime import datetime, timedelta, timezone
|
||
from unittest.mock import AsyncMock, MagicMock, patch
|
||
|
||
import pytest
|
||
|
||
from agentkit.core.protocol import TaskMessage, TaskResult, TaskStatus
|
||
from agentkit.evolution.lifecycle import EvolutionMixin, SoulEvolutionConfig
|
||
from agentkit.evolution.reflector import Reflection
|
||
from agentkit.memory.profile import MemoryStore
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def make_task(task_id: str = "task-1") -> TaskMessage:
    """Build a minimal ``TaskMessage`` fixture addressed to ``test_agent``.

    Args:
        task_id: Identifier stamped on the message.

    Returns:
        A priority-1 ``analysis`` task with no callback URL, created at the
        current UTC time.
    """
    fields = {
        "task_id": task_id,
        "agent_name": "test_agent",
        "task_type": "analysis",
        "priority": 1,
        "input_data": {"content": "test task"},
        "callback_url": None,
        "created_at": datetime.now(timezone.utc),
    }
    return TaskMessage(**fields)
|
||
|
||
|
||
def make_result(task_id: str = "task-1") -> TaskResult:
    """Build a successful ``TaskResult`` fixture produced by ``test_agent``.

    Args:
        task_id: Identifier of the task this result belongs to.

    Returns:
        A ``COMPLETED`` result with no error message whose start and completion
        timestamps are both taken from the current UTC time.
    """
    fields = {
        "task_id": task_id,
        "agent_name": "test_agent",
        "status": TaskStatus.COMPLETED,
        "output_data": {"result": "done"},
        "error_message": None,
        # Two separate clock reads, started_at first, as in the original helper.
        "started_at": datetime.now(timezone.utc),
        "completed_at": datetime.now(timezone.utc),
    }
    return TaskResult(**fields)
|
||
|
||
|
||
def make_reflection(
    quality_score: float = 0.3,
    patterns: list[str] | None = None,
    suggestions: list[str] | None = None,
) -> Reflection:
    """Build a ``Reflection`` fixture with a ``partial`` outcome.

    Args:
        quality_score: Score stored on the reflection.
        patterns: Pattern labels; ``None`` or an empty list falls back to
            ``["reasoning"]``.
        suggestions: Improvement suggestions; only ``None`` falls back to the
            two default suggestions, so an explicit empty list is kept as-is.

    Returns:
        A reflection for task ``task-1`` of agent ``test_agent``.
    """
    # Identity check (not truthiness) so callers can request "no suggestions".
    chosen_suggestions = (
        ["Add more detail", "Be more specific"] if suggestions is None else suggestions
    )
    chosen_patterns = patterns if patterns else ["reasoning"]

    return Reflection(
        task_id="task-1",
        agent_name="test_agent",
        outcome="partial",
        quality_score=quality_score,
        patterns=chosen_patterns,
        insights=["Needs improvement"],
        suggestions=chosen_suggestions,
    )
|
||
|
||
|
||
def make_mock_memory_store() -> MagicMock:
    """Create a mock ``MemoryStore`` whose ``get_file`` returns a usable mock file.

    Returns:
        A ``MagicMock`` constrained to the ``MemoryStore`` spec. Its
        ``get_file`` always hands back the same mock file, which reports a
        canned version section and a fixed two-entry section list.
    """
    memory_file = MagicMock()
    memory_file.read_section.return_value = "版本: 1\n更新时间: 2025-01-01T00:00:00"
    memory_file.list_sections.return_value = ["身份", "版本"]

    memory_store = MagicMock(spec=MemoryStore)
    memory_store.get_file.return_value = memory_file
    return memory_store
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test 1: 3 reflections within window trigger evolution
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestReflectionCountTrigger:
    """Reflection-count trigger: three reflections inside the window evolve the soul."""

    @pytest.mark.asyncio
    async def test_three_reflections_trigger_evolution(self):
        """Expect ``evolve_soul`` to return True only on the third reflection."""
        mixin = EvolutionMixin(
            evolution_config=SoulEvolutionConfig(
                min_reflections=3,
                reflection_window_seconds=3600,
                time_decay_factor=1.0,  # no decay for this test
            )
        )
        memory_store = make_mock_memory_store()
        task, result = make_task(), make_result()

        # (quality score, expected return value) for each successive call;
        # the third call is expected to meet min_reflections=3.
        steps = [(0.3, False), (0.25, False), (0.2, True)]

        with patch(
            "agentkit.tools.memory_tool.MemoryTool.execute",
            new_callable=AsyncMock,
        ) as mock_execute:
            mock_execute.return_value = {"success": True, "version": 2}

            for score, expected in steps:
                reflection = make_reflection(quality_score=score, patterns=["reasoning"])
                evolved = await mixin.evolve_soul(
                    task, result, memory_store, reflection=reflection
                )
                assert evolved is expected

            # The soul update must have gone through MemoryTool.execute exactly once.
            mock_execute.assert_called_once()
            call_kwargs = mock_execute.call_args[1]
            assert call_kwargs["action"] == "update_soul"
            assert call_kwargs["file"] == "soul"

    @pytest.mark.asyncio
    async def test_two_reflections_do_not_trigger(self):
        """Expect no evolution while only two reflections have been recorded."""
        mixin = EvolutionMixin(
            evolution_config=SoulEvolutionConfig(
                min_reflections=3,
                time_decay_factor=1.0,
            )
        )
        memory_store = make_mock_memory_store()
        task, result = make_task(), make_result()

        # Neither the first nor the second reflection is enough.
        for score in (0.3, 0.25):
            reflection = make_reflection(quality_score=score, patterns=["reasoning"])
            evolved = await mixin.evolve_soul(
                task, result, memory_store, reflection=reflection
            )
            assert evolved is False
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test 2: Quality gradient (declining scores) triggers early evolution
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestQualityGradientTrigger:
    """Quality-gradient trigger: steadily declining scores evolve the soul early."""

    @pytest.mark.asyncio
    async def test_declining_scores_trigger_early_evolution(self):
        """Expect evolution after three sharply declining scores despite a high count threshold."""
        mixin = EvolutionMixin(
            evolution_config=SoulEvolutionConfig(
                min_reflections=10,  # high threshold - the count alone won't trigger
                quality_gradient_threshold=-0.15,
                time_decay_factor=1.0,
            )
        )
        memory_store = make_mock_memory_store()
        task, result = make_task(), make_result()

        with patch(
            "agentkit.tools.memory_tool.MemoryTool.execute",
            new_callable=AsyncMock,
        ) as mock_execute:
            mock_execute.return_value = {"success": True, "version": 2}

            # Scores fall 0.45 -> 0.25 -> 0.05: each step drops by about 0.2,
            # beyond the configured -0.15 threshold. All scores stay below 0.5
            # (the original test notes this is needed to pass the quality check).
            for score in (0.45, 0.25):
                warmup = make_reflection(quality_score=score, patterns=["reasoning"])
                await mixin.evolve_soul(task, result, memory_store, reflection=warmup)

            # The third reflection continues the decline and is expected to trigger.
            final = make_reflection(quality_score=0.05, patterns=["reasoning"])
            evolved = await mixin.evolve_soul(
                task, result, memory_store, reflection=final
            )

            assert evolved is True
            mock_execute.assert_called_once()

    @pytest.mark.asyncio
    async def test_stable_scores_do_not_trigger_gradient(self):
        """Expect no evolution when scores improve and the count threshold is out of reach."""
        mixin = EvolutionMixin(
            evolution_config=SoulEvolutionConfig(
                min_reflections=10,  # high threshold
                quality_gradient_threshold=-0.15,
                time_decay_factor=1.0,
            )
        )
        memory_store = make_mock_memory_store()
        task, result = make_task(), make_result()

        # Scores rise 0.3 -> 0.35 -> 0.4, so there is no downward gradient.
        for score in (0.3, 0.35):
            warmup = make_reflection(quality_score=score, patterns=["reasoning"])
            await mixin.evolve_soul(task, result, memory_store, reflection=warmup)

        final = make_reflection(quality_score=0.4, patterns=["reasoning"])
        evolved = await mixin.evolve_soul(task, result, memory_store, reflection=final)

        assert evolved is False
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test 3: Task type weight adjusts trigger threshold
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestTaskTypeWeightTrigger:
    """Task-type weights shift how many reflections are needed to trigger evolution."""

    @pytest.mark.asyncio
    async def test_high_weight_reduces_effective_threshold(self):
        """A 2.0 weight is expected to trigger after two reflections (2 x 2.0 = 4.0 >= 3)."""
        mixin = EvolutionMixin(
            evolution_config=SoulEvolutionConfig(
                min_reflections=3,
                time_decay_factor=1.0,
                task_type_weights={"critical": 2.0},
            )
        )
        memory_store = make_mock_memory_store()
        task, result = make_task(), make_result()

        # (quality score, expected return value):
        #   1 reflection  x 2.0 = 2.0 <  3 -> no evolution
        #   2 reflections x 2.0 = 4.0 >= 3 -> evolution
        steps = [(0.3, False), (0.25, True)]

        with patch(
            "agentkit.tools.memory_tool.MemoryTool.execute",
            new_callable=AsyncMock,
        ) as mock_execute:
            mock_execute.return_value = {"success": True, "version": 2}

            for score, expected in steps:
                reflection = make_reflection(quality_score=score, patterns=["reasoning"])
                evolved = await mixin.evolve_soul(
                    task,
                    result,
                    memory_store,
                    reflection=reflection,
                    task_type="critical",
                )
                assert evolved is expected

            mock_execute.assert_called_once()

    @pytest.mark.asyncio
    async def test_low_weight_increases_effective_threshold(self):
        """A 0.5 weight is expected to keep three reflections below the threshold (3 x 0.5 = 1.5 < 3)."""
        mixin = EvolutionMixin(
            evolution_config=SoulEvolutionConfig(
                min_reflections=3,
                time_decay_factor=1.0,
                task_type_weights={"low_priority": 0.5},
            )
        )
        memory_store = make_mock_memory_store()
        task, result = make_task(), make_result()

        # Record three low_priority reflections; only the last outcome is checked.
        outcomes = []
        for _ in range(3):
            reflection = make_reflection(quality_score=0.3, patterns=["reasoning"])
            outcomes.append(
                await mixin.evolve_soul(
                    task,
                    result,
                    memory_store,
                    reflection=reflection,
                    task_type="low_priority",
                )
            )

        # 3 x 0.5 = 1.5 < 3 -> should not trigger.
        assert outcomes[-1] is False
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test 4: Time decay reduces effective count for old reflections
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestTimeDecayReducesEffectiveCount:
    """Time decay lowers the effective count of old reflections (plus guard-clause checks)."""

    @pytest.mark.asyncio
    async def test_old_reflections_decay_below_threshold(self):
        """Old reflections are expected to contribute too little to reach the threshold."""
        mixin = EvolutionMixin(
            evolution_config=SoulEvolutionConfig(
                min_reflections=3,
                reflection_window_seconds=3600,
                # Original author describes this as a one-hour half-life;
                # presumably decay = factor ** age_in_hours - confirm in EvolutionMixin.
                time_decay_factor=0.5,
            )
        )
        memory_store = make_mock_memory_store()
        task, result = make_task(), make_result()

        # Seed two reflections dated three hours in the past directly into the
        # pending bucket, bypassing evolve_soul.
        # NOTE(review): these entries are 3 h old while reflection_window_seconds
        # is 3600 - confirm they are discounted by decay rather than simply
        # dropped by the window, otherwise this test passes for another reason.
        old_timestamp = datetime.now(timezone.utc) - timedelta(hours=3)
        mixin.pending_soul_updates["reasoning"] = [
            {
                "reflection": make_reflection(quality_score=old_score),
                "timestamp": old_timestamp,
                "score": old_score,
                "task_type": "",
            }
            for old_score in (0.3, 0.25)
        ]

        # Per the original arithmetic: 0.5 ** 3 = 0.125 per old entry, so
        # 2 x 0.125 = 0.25, plus one fresh reflection ~= 1.25 < 3.
        fresh = make_reflection(quality_score=0.2, patterns=["reasoning"])
        evolved = await mixin.evolve_soul(task, result, memory_store, reflection=fresh)

        assert evolved is False

    @pytest.mark.asyncio
    async def test_recent_reflections_no_decay(self):
        """Three just-recorded reflections are expected to trigger despite a 0.5 decay factor."""
        mixin = EvolutionMixin(
            evolution_config=SoulEvolutionConfig(
                min_reflections=3,
                time_decay_factor=0.5,
            )
        )
        memory_store = make_mock_memory_store()
        task, result = make_task(), make_result()

        with patch(
            "agentkit.tools.memory_tool.MemoryTool.execute",
            new_callable=AsyncMock,
        ) as mock_execute:
            mock_execute.return_value = {"success": True, "version": 2}

            # Only the outcome of the third call is asserted.
            outcomes = []
            for _ in range(3):
                reflection = make_reflection(quality_score=0.3, patterns=["reasoning"])
                outcomes.append(
                    await mixin.evolve_soul(
                        task, result, memory_store, reflection=reflection
                    )
                )

            assert outcomes[-1] is True

    @pytest.mark.asyncio
    async def test_no_memory_store_returns_false(self):
        """Without a memory store, ``evolve_soul`` is expected to return False."""
        mixin = EvolutionMixin(evolution_config=SoulEvolutionConfig(min_reflections=1))
        task, result = make_task(), make_result()
        reflection = make_reflection(quality_score=0.3, patterns=["reasoning"])

        evolved = await mixin.evolve_soul(
            task, result, memory_store=None, reflection=reflection
        )

        assert evolved is False

    @pytest.mark.asyncio
    async def test_high_quality_reflection_does_not_trigger(self):
        """A high-quality reflection (score 0.8) is expected not to trigger evolution."""
        mixin = EvolutionMixin(evolution_config=SoulEvolutionConfig(min_reflections=1))
        memory_store = make_mock_memory_store()
        task, result = make_task(), make_result()

        # Per the original test, scores >= 0.5 should not even be recorded.
        reflection = make_reflection(quality_score=0.8, patterns=["reasoning"])
        evolved = await mixin.evolve_soul(
            task, result, memory_store, reflection=reflection
        )

        assert evolved is False

    @pytest.mark.asyncio
    async def test_no_suggestions_does_not_trigger(self):
        """A low-quality reflection with an empty suggestion list is expected not to trigger."""
        mixin = EvolutionMixin(evolution_config=SoulEvolutionConfig(min_reflections=1))
        memory_store = make_mock_memory_store()
        task, result = make_task(), make_result()

        # Low score, but explicitly no suggestions to act on.
        reflection = make_reflection(
            quality_score=0.3, patterns=["reasoning"], suggestions=[]
        )
        evolved = await mixin.evolve_soul(
            task, result, memory_store, reflection=reflection
        )

        assert evolved is False
|