fischer-agentkit/tests/integration/test_soul_evolution_trigger.py

416 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""集成测试 - Soul 进化触发条件
测试 EvolutionMixin.evolve_soul 的多维触发逻辑:
- 时间窗口内反思计数
- 质量梯度(下降分数)触发早期进化
- 任务类型权重调整触发阈值
- 时间衰减降低旧反思的有效计数
仅 mock MemoryTool (文件 I/O), 使用真实 EvolutionMixin + SoulEvolutionConfig 实例。
"""
from __future__ import annotations
from datetime import datetime, timedelta, timezone
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from agentkit.core.protocol import TaskMessage, TaskResult, TaskStatus
from agentkit.evolution.lifecycle import EvolutionMixin, SoulEvolutionConfig
from agentkit.evolution.reflector import Reflection
from agentkit.memory.profile import MemoryStore
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def make_task(task_id: str = "task-1") -> TaskMessage:
    """Build a minimal ``analysis`` task message addressed to ``test_agent``.

    Args:
        task_id: Identifier stamped on the message.

    Returns:
        A TaskMessage with priority 1, a fixed input payload, no callback
        URL, and a timezone-aware UTC creation time.
    """
    payload = {"content": "test task"}
    created = datetime.now(timezone.utc)
    return TaskMessage(
        task_id=task_id,
        agent_name="test_agent",
        task_type="analysis",
        priority=1,
        input_data=payload,
        callback_url=None,
        created_at=created,
    )
def make_result(task_id: str = "task-1") -> TaskResult:
    """Build a completed task result produced by ``test_agent``.

    Args:
        task_id: Identifier of the task this result belongs to.

    Returns:
        A TaskResult with status COMPLETED, a fixed output payload, no error
        message, and timezone-aware UTC start/completion times.
    """
    # Two separate clock reads, start first, so the completion time is never
    # earlier than the start time.
    started = datetime.now(timezone.utc)
    finished = datetime.now(timezone.utc)
    output = {"result": "done"}
    return TaskResult(
        task_id=task_id,
        agent_name="test_agent",
        status=TaskStatus.COMPLETED,
        output_data=output,
        error_message=None,
        started_at=started,
        completed_at=finished,
    )
def make_reflection(
    quality_score: float = 0.3,
    patterns: list[str] | None = None,
    suggestions: list[str] | None = None,
) -> Reflection:
    """Build a ``partial``-outcome Reflection for ``task-1`` / ``test_agent``.

    Args:
        quality_score: Score stored on the reflection.
        patterns: Pattern names; ``None`` or an empty list falls back to
            ``["reasoning"]``.
        suggestions: Improvement suggestions; only ``None`` falls back to the
            two default suggestions, so an explicit empty list is kept as-is.

    Returns:
        The populated Reflection instance.
    """
    # Identity check (not truthiness) so callers can pass [] to model a
    # reflection that carries no suggestions at all.
    effective_suggestions = (
        ["Add more detail", "Be more specific"]
        if suggestions is None
        else suggestions
    )
    effective_patterns = patterns if patterns else ["reasoning"]
    return Reflection(
        task_id="task-1",
        agent_name="test_agent",
        outcome="partial",
        quality_score=quality_score,
        patterns=effective_patterns,
        insights=["Needs improvement"],
        suggestions=effective_suggestions,
    )
def make_mock_memory_store() -> MagicMock:
    """Create a mock MemoryStore whose ``get_file`` returns a stub memory file.

    The stub file answers ``read_section`` with a fixed version/timestamp
    text and ``list_sections`` with two section names.

    Returns:
        A ``MagicMock`` constrained to the MemoryStore interface via ``spec``.
    """
    memory_file = MagicMock()
    memory_file.list_sections.return_value = ["身份", "版本"]
    memory_file.read_section.return_value = "版本: 1\n更新时间: 2025-01-01T00:00:00"

    store = MagicMock(spec=MemoryStore)
    store.get_file.return_value = memory_file
    return store
# ---------------------------------------------------------------------------
# Test 1: 3 reflections within window trigger evolution
# ---------------------------------------------------------------------------
class TestReflectionCountTrigger:
    """Three reflections inside the time window trigger evolution."""

    @pytest.mark.asyncio
    async def test_three_reflections_trigger_evolution(self):
        """The third low-quality reflection reaches ``min_reflections`` and evolves."""
        config = SoulEvolutionConfig(
            min_reflections=3,
            reflection_window_seconds=3600,
            time_decay_factor=1.0,  # No decay for this test
        )
        mixin = EvolutionMixin(evolution_config=config)
        memory_store = make_mock_memory_store()
        task = make_task()
        result = make_result()

        with patch(
            "agentkit.tools.memory_tool.MemoryTool.execute",
            new_callable=AsyncMock,
        ) as mock_execute:
            mock_execute.return_value = {"success": True, "version": 2}

            # First reflection — should not trigger
            reflection1 = make_reflection(quality_score=0.3, patterns=["reasoning"])
            evolved = await mixin.evolve_soul(
                task, result, memory_store, reflection=reflection1
            )
            assert evolved is False

            # Second reflection — should not trigger
            reflection2 = make_reflection(quality_score=0.25, patterns=["reasoning"])
            evolved = await mixin.evolve_soul(
                task, result, memory_store, reflection=reflection2
            )
            assert evolved is False

            # Third reflection — should trigger (3 >= min_reflections=3)
            reflection3 = make_reflection(quality_score=0.2, patterns=["reasoning"])
            evolved = await mixin.evolve_soul(
                task, result, memory_store, reflection=reflection3
            )
            assert evolved is True

            # MemoryTool.execute should have been called for the soul update
            mock_execute.assert_called_once()
            call_kwargs = mock_execute.call_args.kwargs
            assert call_kwargs["action"] == "update_soul"
            assert call_kwargs["file"] == "soul"

    @pytest.mark.asyncio
    async def test_two_reflections_do_not_trigger(self):
        """Two reflections stay below ``min_reflections=3`` and write nothing."""
        config = SoulEvolutionConfig(
            min_reflections=3,
            time_decay_factor=1.0,
        )
        mixin = EvolutionMixin(evolution_config=config)
        memory_store = make_mock_memory_store()
        task = make_task()
        result = make_result()

        # Patch the tool here as well: if evolution were triggered by mistake,
        # the real MemoryTool must not run against the MagicMock store.
        with patch(
            "agentkit.tools.memory_tool.MemoryTool.execute",
            new_callable=AsyncMock,
        ) as mock_execute:
            # First reflection
            reflection1 = make_reflection(quality_score=0.3, patterns=["reasoning"])
            evolved = await mixin.evolve_soul(
                task, result, memory_store, reflection=reflection1
            )
            assert evolved is False

            # Second reflection — still not enough
            reflection2 = make_reflection(quality_score=0.25, patterns=["reasoning"])
            evolved = await mixin.evolve_soul(
                task, result, memory_store, reflection=reflection2
            )
            assert evolved is False

            # No soul update may have been written.
            mock_execute.assert_not_called()
# ---------------------------------------------------------------------------
# Test 2: Quality gradient (declining scores) triggers early evolution
# ---------------------------------------------------------------------------
class TestQualityGradientTrigger:
    """A quality gradient (steadily declining scores) triggers early evolution."""

    @pytest.mark.asyncio
    async def test_declining_scores_trigger_early_evolution(self):
        """Consecutive score drops beyond the threshold evolve before the count is met."""
        config = SoulEvolutionConfig(
            min_reflections=10,  # High threshold — won't trigger by count
            quality_gradient_threshold=-0.15,
            time_decay_factor=1.0,
        )
        mixin = EvolutionMixin(evolution_config=config)
        memory_store = make_mock_memory_store()
        task = make_task()
        result = make_result()

        with patch(
            "agentkit.tools.memory_tool.MemoryTool.execute",
            new_callable=AsyncMock,
        ) as mock_execute:
            mock_execute.return_value = {"success": True, "version": 2}

            # Record 3 reflections with declining scores (all < 0.5 to pass the
            # quality check). Score drops: 0.45 → 0.25 → 0.05 (each drop > 0.15)
            reflection1 = make_reflection(quality_score=0.45, patterns=["reasoning"])
            await mixin.evolve_soul(task, result, memory_store, reflection=reflection1)
            reflection2 = make_reflection(quality_score=0.25, patterns=["reasoning"])
            await mixin.evolve_soul(task, result, memory_store, reflection=reflection2)

            # Third reflection with continued decline should trigger quality gradient
            reflection3 = make_reflection(quality_score=0.05, patterns=["reasoning"])
            evolved = await mixin.evolve_soul(
                task, result, memory_store, reflection=reflection3
            )

            # Quality gradient: 0.45→0.25 (drop=-0.2), 0.25→0.05 (drop=-0.2)
            # Both drops <= -0.15, so quality_gradient_triggered = True
            assert evolved is True
            mock_execute.assert_called_once()

    @pytest.mark.asyncio
    async def test_stable_scores_do_not_trigger_gradient(self):
        """Stable or improving scores neither evolve nor write a soul update."""
        config = SoulEvolutionConfig(
            min_reflections=10,  # High threshold
            quality_gradient_threshold=-0.15,
            time_decay_factor=1.0,
        )
        mixin = EvolutionMixin(evolution_config=config)
        memory_store = make_mock_memory_store()
        task = make_task()
        result = make_result()

        # Patch the tool so an accidental trigger cannot reach the real
        # MemoryTool, and so the absence of a write can be asserted.
        with patch(
            "agentkit.tools.memory_tool.MemoryTool.execute",
            new_callable=AsyncMock,
        ) as mock_execute:
            # Record 3 reflections with stable/improving scores
            reflection1 = make_reflection(quality_score=0.3, patterns=["reasoning"])
            await mixin.evolve_soul(task, result, memory_store, reflection=reflection1)
            reflection2 = make_reflection(quality_score=0.35, patterns=["reasoning"])
            await mixin.evolve_soul(task, result, memory_store, reflection=reflection2)
            reflection3 = make_reflection(quality_score=0.4, patterns=["reasoning"])
            evolved = await mixin.evolve_soul(
                task, result, memory_store, reflection=reflection3
            )

            # Scores are improving, no quality gradient trigger
            assert evolved is False
            mock_execute.assert_not_called()
# ---------------------------------------------------------------------------
# Test 3: Task type weight adjusts trigger threshold
# ---------------------------------------------------------------------------
class TestTaskTypeWeightTrigger:
    """Task-type weights scale how quickly the trigger threshold is reached."""

    @pytest.mark.asyncio
    async def test_high_weight_reduces_effective_threshold(self):
        """High weight lowers the effective threshold.

        2 reflections × weight 2.0 = effective 4.0 >= min_reflections 3.
        """
        config = SoulEvolutionConfig(
            min_reflections=3,
            time_decay_factor=1.0,
            task_type_weights={"critical": 2.0},
        )
        mixin = EvolutionMixin(evolution_config=config)
        memory_store = make_mock_memory_store()
        task = make_task()
        result = make_result()

        with patch(
            "agentkit.tools.memory_tool.MemoryTool.execute",
            new_callable=AsyncMock,
        ) as mock_execute:
            mock_execute.return_value = {"success": True, "version": 2}

            # First reflection with critical task type
            reflection1 = make_reflection(quality_score=0.3, patterns=["reasoning"])
            evolved = await mixin.evolve_soul(
                task,
                result,
                memory_store,
                reflection=reflection1,
                task_type="critical",
            )
            # 1 reflection × weight 2.0 = 2.0 < 3
            assert evolved is False

            # Second reflection with critical task type
            reflection2 = make_reflection(quality_score=0.25, patterns=["reasoning"])
            evolved = await mixin.evolve_soul(
                task,
                result,
                memory_store,
                reflection=reflection2,
                task_type="critical",
            )
            # 2 reflections × weight 2.0 = 4.0 >= 3
            assert evolved is True
            mock_execute.assert_called_once()

    @pytest.mark.asyncio
    async def test_low_weight_increases_effective_threshold(self):
        """Low weight raises the effective threshold.

        3 reflections × weight 0.5 = effective 1.5 < min_reflections 3.
        """
        config = SoulEvolutionConfig(
            min_reflections=3,
            time_decay_factor=1.0,
            task_type_weights={"low_priority": 0.5},
        )
        mixin = EvolutionMixin(evolution_config=config)
        memory_store = make_mock_memory_store()
        task = make_task()
        result = make_result()

        # Patch the tool so an accidental trigger cannot reach the real
        # MemoryTool, and so the absence of a write can be asserted.
        with patch(
            "agentkit.tools.memory_tool.MemoryTool.execute",
            new_callable=AsyncMock,
        ) as mock_execute:
            # 3 reflections with low_priority task type
            for count in range(1, 4):
                reflection = make_reflection(quality_score=0.3, patterns=["reasoning"])
                evolved = await mixin.evolve_soul(
                    task,
                    result,
                    memory_store,
                    reflection=reflection,
                    task_type="low_priority",
                )
                # count × 0.5 <= 1.5 < 3 → no call in the sequence may
                # trigger, not only the last one.
                assert evolved is False, f"unexpected trigger at reflection {count}"

            mock_execute.assert_not_called()
# ---------------------------------------------------------------------------
# Test 4: Time decay reduces effective count for old reflections
# ---------------------------------------------------------------------------
class TestTimeDecayReducesEffectiveCount:
    """Time decay lowers the effective count of old reflections.

    Also holds the guard-condition tests (no memory store, high-quality
    reflection, reflection without suggestions).
    """

    _EXECUTE_TARGET = "agentkit.tools.memory_tool.MemoryTool.execute"

    @pytest.mark.asyncio
    async def test_old_reflections_decay_below_threshold(self):
        """Old reflections leave the effective count below the threshold."""
        config = SoulEvolutionConfig(
            min_reflections=3,
            reflection_window_seconds=3600,
            time_decay_factor=0.5,  # Half-life of 1 hour
        )
        mixin = EvolutionMixin(evolution_config=config)
        memory_store = make_mock_memory_store()
        task = make_task()
        result = make_result()

        # Manually add old reflections to pending_soul_updates
        now = datetime.now(timezone.utc)
        old_timestamp = now - timedelta(hours=3)  # 3 hours ago
        # NOTE(review): these entries are 3 h old while the window is 3600 s,
        # so they may be dropped by the window filter before decay is ever
        # applied — confirm against evolve_soul that this test really
        # exercises decay and not just window exclusion.

        # Add 2 old reflections manually
        mixin.pending_soul_updates["reasoning"] = [
            {
                "reflection": make_reflection(quality_score=0.3),
                "timestamp": old_timestamp,
                "score": 0.3,
                "task_type": "",
            },
            {
                "reflection": make_reflection(quality_score=0.25),
                "timestamp": old_timestamp,
                "score": 0.25,
                "task_type": "",
            },
        ]

        # Patch the tool so an accidental trigger cannot reach the real
        # MemoryTool, and so the absence of a write can be asserted.
        with patch(self._EXECUTE_TARGET, new_callable=AsyncMock) as mock_execute:
            # Add a recent reflection via evolve_soul
            # Time decay: 0.5^3 = 0.125 per old reflection → 2 × 0.125 = 0.25
            # Plus 1 new reflection → total effective ≈ 1.25 < 3
            recent_reflection = make_reflection(
                quality_score=0.2, patterns=["reasoning"]
            )
            evolved = await mixin.evolve_soul(
                task, result, memory_store, reflection=recent_reflection
            )

            # Effective count should be well below 3 due to decay
            assert evolved is False
            mock_execute.assert_not_called()

    @pytest.mark.asyncio
    async def test_recent_reflections_no_decay(self):
        """Recent reflections are not meaningfully affected by time decay."""
        config = SoulEvolutionConfig(
            min_reflections=3,
            time_decay_factor=0.5,
        )
        mixin = EvolutionMixin(evolution_config=config)
        memory_store = make_mock_memory_store()
        task = make_task()
        result = make_result()

        with patch(self._EXECUTE_TARGET, new_callable=AsyncMock) as mock_execute:
            mock_execute.return_value = {"success": True, "version": 2}

            # 3 recent reflections should trigger (no significant decay)
            evolved = False
            for _ in range(3):
                reflection = make_reflection(quality_score=0.3, patterns=["reasoning"])
                evolved = await mixin.evolve_soul(
                    task, result, memory_store, reflection=reflection
                )

            assert evolved is True
            # The trigger must have produced exactly one soul update.
            mock_execute.assert_called_once()

    @pytest.mark.asyncio
    async def test_no_memory_store_returns_false(self):
        """Evolution is not triggered when no MemoryStore is supplied."""
        config = SoulEvolutionConfig(min_reflections=1)
        mixin = EvolutionMixin(evolution_config=config)
        task = make_task()
        result = make_result()
        reflection = make_reflection(quality_score=0.3, patterns=["reasoning"])

        with patch(self._EXECUTE_TARGET, new_callable=AsyncMock) as mock_execute:
            evolved = await mixin.evolve_soul(
                task, result, memory_store=None, reflection=reflection
            )

            assert evolved is False
            mock_execute.assert_not_called()

    @pytest.mark.asyncio
    async def test_high_quality_reflection_does_not_trigger(self):
        """A high-quality reflection (quality_score >= 0.5) does not trigger evolution."""
        config = SoulEvolutionConfig(min_reflections=1)
        mixin = EvolutionMixin(evolution_config=config)
        memory_store = make_mock_memory_store()
        task = make_task()
        result = make_result()

        # High quality reflection — should not even be recorded
        reflection = make_reflection(quality_score=0.8, patterns=["reasoning"])

        with patch(self._EXECUTE_TARGET, new_callable=AsyncMock) as mock_execute:
            evolved = await mixin.evolve_soul(
                task, result, memory_store, reflection=reflection
            )

            assert evolved is False
            mock_execute.assert_not_called()

    @pytest.mark.asyncio
    async def test_no_suggestions_does_not_trigger(self):
        """A reflection without suggestions does not trigger evolution."""
        config = SoulEvolutionConfig(min_reflections=1)
        mixin = EvolutionMixin(evolution_config=config)
        memory_store = make_mock_memory_store()
        task = make_task()
        result = make_result()

        # Low quality but no suggestions
        reflection = make_reflection(
            quality_score=0.3, patterns=["reasoning"], suggestions=[]
        )

        with patch(self._EXECUTE_TARGET, new_callable=AsyncMock) as mock_execute:
            evolved = await mixin.evolve_soul(
                task, result, memory_store, reflection=reflection
            )

            assert evolved is False
            mock_execute.assert_not_called()