fischer-agentkit/tests/unit/test_soul_evolution.py

553 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Tests for U8: Soul Dynamic Evolution — SOUL 动态进化与版本追踪."""
from __future__ import annotations
from datetime import datetime, timedelta, timezone
from pathlib import Path
from unittest.mock import AsyncMock
import pytest
from agentkit.core.protocol import TaskMessage, TaskResult, TaskStatus
from agentkit.evolution.lifecycle import EvolutionMixin, SoulEvolutionConfig
from agentkit.evolution.reflector import Reflection, Reflector
from agentkit.memory.profile import MemoryStore
from agentkit.tools.memory_tool import MemoryTool
# ── Helpers ──────────────────────────────────────────────────
@pytest.fixture
def store(tmp_path: Path) -> MemoryStore:
    """Provide a MemoryStore whose base directory is pytest's ``tmp_path``."""
    memory_store = MemoryStore(base_dir=tmp_path)
    return memory_store
@pytest.fixture
def tool(store: MemoryStore) -> MemoryTool:
    """Provide a MemoryTool backed by the ``store`` fixture."""
    memory_tool = MemoryTool(memory_store=store)
    return memory_tool
def _make_task(task_id: str = "test-001") -> TaskMessage:
    """Build a minimal ``echo`` task addressed to ``evolving_agent``.

    Args:
        task_id: Identifier stored on the returned message.

    Returns:
        A TaskMessage whose ``created_at`` is the current UTC time.
    """
    fields = {
        "task_id": task_id,
        "agent_name": "evolving_agent",
        "task_type": "echo",
        "priority": 0,
        "input_data": {"query": "hello"},
        "callback_url": None,
        "created_at": datetime.now(timezone.utc),
    }
    return TaskMessage(**fields)
def _make_result(status: str = TaskStatus.COMPLETED) -> TaskResult:
    """Build a TaskResult for task ``test-001`` run by ``evolving_agent``.

    Args:
        status: Status stored on the result; defaults to ``TaskStatus.COMPLETED``.

    Returns:
        A TaskResult with a small fixed payload, no error message, UTC
        start/completion timestamps taken at call time, and a 5.0 second
        ``elapsed_seconds`` metric.
    """
    fields = {
        "task_id": "test-001",
        "agent_name": "evolving_agent",
        "status": status,
        "output_data": {"key": "value"},
        "error_message": None,
        # Two separate clock reads, exactly as the original helper did.
        "started_at": datetime.now(timezone.utc),
        "completed_at": datetime.now(timezone.utc),
        "metrics": {"elapsed_seconds": 5.0},
    }
    return TaskResult(**fields)
class LowQualityReflector(Reflector):
    """Reflector stub that always reports a low-quality failure with a suggestion."""

    async def reflect(self, task, result):
        """Return a fixed failing reflection (score 0.2) for ``task``/``result``.

        Only ``task.task_id`` and ``result.agent_name`` are read from the inputs;
        every other field is a constant.
        """
        reflection = Reflection(
            task_id=task.task_id,
            agent_name=result.agent_name,
            outcome="failure",
            quality_score=0.2,
            patterns=["slow_execution"],
            insights=["Low quality score indicates potential issues"],
            suggestions=["Consider prompt optimization for this task type"],
        )
        return reflection
class HighQualityReflector(Reflector):
    """Reflector stub that always reports a high-quality success."""

    async def reflect(self, task, result):
        """Return a fixed successful reflection (score 0.8) with no suggestions.

        Only ``task.task_id`` and ``result.agent_name`` are read from the inputs.
        """
        reflection = Reflection(
            task_id=task.task_id,
            agent_name=result.agent_name,
            outcome="success",
            quality_score=0.8,
            patterns=["fast_execution"],
            insights=[],
            suggestions=[],
        )
        return reflection
class LowQualityNoSuggestionsReflector(Reflector):
    """Reflector stub reporting a low-quality failure without any suggestions."""

    async def reflect(self, task, result):
        """Return a fixed failing reflection (score 0.2) whose suggestions are empty.

        Only ``task.task_id`` and ``result.agent_name`` are read from the inputs.
        """
        reflection = Reflection(
            task_id=task.task_id,
            agent_name=result.agent_name,
            outcome="failure",
            quality_score=0.2,
            patterns=["slow_execution"],
            insights=["Low quality"],
            suggestions=[],
        )
        return reflection
# ── MemoryTool update_soul action 测试 ──────────────────────
class TestMemoryToolUpdateSoul:
    """Tests for the MemoryTool ``update_soul`` action."""

    async def test_basic_update_increments_version(
        self, tool: MemoryTool, store: MemoryStore
    ):
        """A basic update bumps the version number."""
        # Seed the SOUL file with an initial section.
        store.get_file("soul").write("## 身份\n我是助手")
        result = await tool.execute(
            action="update_soul",
            file="soul",
            section="性格",
            content="更加耐心",
        )
        assert result["success"] is True
        assert result["version"] == 2
        # The version section must carry the new number.
        version_content = store.get_file("soul").read_section("版本")
        assert "版本: 2" in version_content

    async def test_creates_version_section_if_missing(
        self, tool: MemoryTool, store: MemoryStore
    ):
        """The version section is created when the file does not have one yet."""
        store.get_file("soul").write("## 身份\n我是助手")
        result = await tool.execute(
            action="update_soul",
            file="soul",
            section="性格",
            content="友好",
        )
        assert result["success"] is True
        assert result["version"] == 2
        # The version section should now be listed.
        sections = store.get_file("soul").list_sections()
        assert "版本" in sections

    async def test_adds_update_history_entry(
        self, tool: MemoryTool, store: MemoryStore
    ):
        """An update-history entry records version, section and reason."""
        store.get_file("soul").write("## 身份\n我是助手")
        result = await tool.execute(
            action="update_soul",
            file="soul",
            section="性格",
            content="更加耐心",
            reason="用户反馈需要更耐心",
        )
        assert result["success"] is True
        history_content = store.get_file("soul").read_section("更新历史")
        assert "v2" in history_content
        assert "性格" in history_content
        assert "用户反馈需要更耐心" in history_content

    async def test_history_limited_to_10_entries(
        self, tool: MemoryTool, store: MemoryStore
    ):
        """The update history keeps at most 10 entries."""
        store.get_file("soul").write("## 身份\n我是助手")
        # Perform 12 updates, two more than the history limit.
        for i in range(12):
            result = await tool.execute(
                action="update_soul",
                file="soul",
                section=f"section_{i}",
                content=f"content_{i}",
            )
            assert result["success"] is True
        history_content = store.get_file("soul").read_section("更新历史")
        lines = [line for line in history_content.strip().split("\n") if line.strip()]
        # Without this guard the upper-bound check below would pass vacuously
        # if the history were never written at all.
        assert lines, "update history should not be empty after 12 updates"
        assert len(lines) <= 10

    async def test_requires_section_and_content(
        self, tool: MemoryTool, store: MemoryStore
    ):
        """Omitting ``section`` or ``content`` yields an error result."""
        store.get_file("soul").write("## 身份\n我是助手")
        # Missing section.
        result = await tool.execute(
            action="update_soul",
            file="soul",
            content="内容",
        )
        assert result["success"] is False
        assert "section" in result.get("error", "").lower()
        # Missing content.
        result = await tool.execute(
            action="update_soul",
            file="soul",
            section="性格",
        )
        assert result["success"] is False
        assert "content" in result.get("error", "").lower()

    async def test_invalid_action_still_rejected(self, tool: MemoryTool):
        """An unknown action is still rejected."""
        result = await tool.execute(action="delete_everything", file="soul")
        assert result["success"] is False
        assert "Unknown action" in result.get("error", "")
# ── EvolutionMixin.evolve_soul 测试 ──────────────────────────
class TestEvolveSoul:
    """Tests for ``EvolutionMixin.evolve_soul``."""

    async def test_no_update_when_fewer_than_3_reflections(self, store: MemoryStore):
        """Fewer than three same-category reflections must not update the soul."""
        mixin = EvolutionMixin(reflector=LowQualityReflector())
        task, result = _make_task(), _make_result()
        # Two calls stay below the three-reflection threshold.
        for _attempt in range(2):
            outcome = await mixin.evolve_soul(task, result, memory_store=store)
            assert outcome is False

    async def test_triggers_update_when_3_same_category_reflections(
        self, store: MemoryStore
    ):
        """Three accumulated same-category reflections trigger a soul update."""
        mixin = EvolutionMixin(reflector=LowQualityReflector())
        task, result = _make_task(), _make_result()
        # The first two calls must not trigger.
        for _attempt in range(2):
            outcome = await mixin.evolve_soul(task, result, memory_store=store)
            assert outcome is False
        # The third call triggers.
        outcome = await mixin.evolve_soul(task, result, memory_store=store)
        assert outcome is True
        # The SOUL file should now mention the recurring pattern.
        soul_file = store.get_file("soul")
        assert "slow_execution" in soul_file.read()

    async def test_no_update_without_memory_store(self):
        """No update happens when no memory store is supplied."""
        mixin = EvolutionMixin(reflector=LowQualityReflector())
        task, result = _make_task(), _make_result()
        outcome = await mixin.evolve_soul(task, result, memory_store=None)
        assert outcome is False

    async def test_no_update_when_quality_score_above_threshold(
        self, store: MemoryStore
    ):
        """A quality score of 0.5 or more must not trigger an update."""
        mixin = EvolutionMixin(reflector=HighQualityReflector())
        task, result = _make_task(), _make_result()
        outcome = await mixin.evolve_soul(task, result, memory_store=store)
        assert outcome is False
# ── Multi-dimensional trigger tests ──────────────────────────
class TestTimeDecay:
    """Tests for the time-decay trigger."""

    async def test_recent_reflections_count_fully(self, store: MemoryStore):
        """Reflections inside the window count fully toward the effective total."""
        config = SoulEvolutionConfig(
            min_reflections=3,
            reflection_window_seconds=3600,
            time_decay_factor=0.5,
        )
        mixin = EvolutionMixin(
            reflector=LowQualityReflector(), evolution_config=config
        )
        task, result = _make_task(), _make_result()
        # Three recent reflections should trigger: the first two do not...
        for _attempt in range(2):
            outcome = await mixin.evolve_soul(task, result, memory_store=store)
            assert outcome is False
        # ...and the third does.
        outcome = await mixin.evolve_soul(task, result, memory_store=store)
        assert outcome is True

    async def test_old_reflections_decay(self, store: MemoryStore):
        """Old reflections decay, leaving too small an effective count to trigger."""
        config = SoulEvolutionConfig(
            min_reflections=3,
            reflection_window_seconds=3600,
            time_decay_factor=0.5,
        )
        mixin = EvolutionMixin(
            reflector=LowQualityReflector(), evolution_config=config
        )
        task, result = _make_task(), _make_result()
        # Manually insert two old reflections (10 hours ago).
        old_time = datetime.now(timezone.utc) - timedelta(hours=10)
        pending = mixin.pending_soul_updates.setdefault("slow_execution", [])
        for _ in range(2):
            stale_reflection = Reflection(
                task_id="old",
                agent_name="evolving_agent",
                outcome="failure",
                quality_score=0.2,
                patterns=["slow_execution"],
                insights=[],
                suggestions=["Improve speed"],
            )
            pending.append(
                {
                    "reflection": stale_reflection,
                    "timestamp": old_time,
                    "score": 0.2,
                    "task_type": "",
                }
            )
        # One fresh reflection: 2 * 0.5^10 + 1 ≈ 1.002 < 3, so no trigger.
        outcome = await mixin.evolve_soul(task, result, memory_store=store)
        assert outcome is False
class TestQualityGradient:
    """Tests for the quality-gradient trigger."""

    @staticmethod
    def _pending_entry(task_id: str, score: float, suggestion: str) -> dict:
        """Build one pre-seeded ``pending_soul_updates`` entry.

        Args:
            task_id: Task id stored on the embedded Reflection.
            score: Used both as the Reflection's ``quality_score`` and as the
                entry's ``"score"``.
            suggestion: The single suggestion carried by the Reflection.

        Returns:
            A dict with ``reflection``, ``timestamp`` (current UTC time),
            ``score`` and an empty ``task_type``.
        """
        return {
            "reflection": Reflection(
                task_id=task_id,
                agent_name="evolving_agent",
                outcome="failure",
                quality_score=score,
                patterns=["slow_execution"],
                insights=[],
                suggestions=[suggestion],
            ),
            "timestamp": datetime.now(timezone.utc),
            "score": score,
            "task_type": "",
        }

    async def test_declining_scores_trigger_early(self, store: MemoryStore):
        """Three consecutive scores dropping past the threshold trigger an update."""
        config = SoulEvolutionConfig(
            min_reflections=3,
            quality_gradient_threshold=-0.15,
        )
        reflector = LowQualityReflector()
        mixin = EvolutionMixin(reflector=reflector, evolution_config=config)
        task = _make_task()
        result = _make_result()
        # Pre-seed two reflections with declining scores (0.4 -> 0.2).
        pending = mixin.pending_soul_updates.setdefault("slow_execution", [])
        pending.append(self._pending_entry("g1", 0.4, "Improve"))
        pending.append(self._pending_entry("g2", 0.2, "Improve more"))
        # Third reflection: score=0.0, a drop of 0.2 > 0.15 threshold, so it triggers.
        updated = await mixin.evolve_soul(
            task, result, memory_store=store, score=0.0
        )
        assert updated is True

    async def test_stable_scores_do_not_trigger_early(self, store: MemoryStore):
        """Stable scores must not trigger an early update."""
        config = SoulEvolutionConfig(
            min_reflections=3,
            quality_gradient_threshold=-0.15,
        )
        reflector = LowQualityReflector()
        mixin = EvolutionMixin(reflector=reflector, evolution_config=config)
        task = _make_task()
        result = _make_result()
        # Seed only ONE prior reflection. With two seeded entries the third
        # call would reach min_reflections=3 and trigger through the plain
        # count rule, hiding whether the gradient rule stayed quiet.
        mixin.pending_soul_updates.setdefault("slow_execution", []).append(
            self._pending_entry("s1", 0.2, "Improve")
        )
        # Two reflections in total with stable scores (0.2 -> 0.19): no trigger.
        updated = await mixin.evolve_soul(
            task, result, memory_store=store, score=0.19
        )
        assert updated is False
class TestTaskTypeWeight:
    """Tests for the task-type weight trigger."""

    async def test_code_generation_triggers_at_2(self, store: MemoryStore):
        """With weight 1.5, two code_generation reflections trigger (2*1.5=3 >= 3)."""
        config = SoulEvolutionConfig(
            min_reflections=3,
            task_type_weights={"code_generation": 1.5, "chat": 0.5},
        )
        mixin = EvolutionMixin(
            reflector=LowQualityReflector(), evolution_config=config
        )
        task, result = _make_task(), _make_result()

        async def reflect_once():
            """Run one evolve_soul call tagged as ``code_generation``."""
            return await mixin.evolve_soul(
                task, result, memory_store=store, task_type="code_generation"
            )

        # First call: not enough weight yet.
        outcome = await reflect_once()
        assert outcome is False
        # Second call: effective_count=2, weight=1.5, weighted=3.0 >= 3, triggers.
        outcome = await reflect_once()
        assert outcome is True

    async def test_chat_needs_more_reflections(self, store: MemoryStore):
        """With weight 0.5, chat tasks need more reflections before triggering."""
        config = SoulEvolutionConfig(
            min_reflections=3,
            task_type_weights={"code_generation": 1.5, "chat": 0.5},
        )
        mixin = EvolutionMixin(
            reflector=LowQualityReflector(), evolution_config=config
        )
        task, result = _make_task(), _make_result()

        async def reflect_once():
            """Run one evolve_soul call tagged as ``chat``."""
            return await mixin.evolve_soul(
                task, result, memory_store=store, task_type="chat"
            )

        # Four chat reflections: effective_count=4, weight=0.5, weighted=2.0 < 3.
        for _attempt in range(4):
            outcome = await reflect_once()
            assert outcome is False
        # Fifth call: 5 * 0.5 = 2.5 < 3, still no trigger.
        outcome = await reflect_once()
        assert outcome is False
        # Sixth call: 6 * 0.5 = 3.0 >= 3, triggers.
        outcome = await reflect_once()
        assert outcome is True
class TestBackwardCompatibility:
    """Backward-compatibility tests: without a config, behavior matches the old one."""

    async def test_no_config_3_reflections_trigger(self, store: MemoryStore):
        """Without ``evolution_config``, the third reflection triggers as before."""
        mixin = EvolutionMixin(reflector=LowQualityReflector())
        task, result = _make_task(), _make_result()
        # The first two calls must not trigger.
        for _attempt in range(2):
            outcome = await mixin.evolve_soul(task, result, memory_store=store)
            assert outcome is False
        # The third call triggers.
        outcome = await mixin.evolve_soul(task, result, memory_store=store)
        assert outcome is True

    async def test_no_config_fewer_than_3_no_trigger(self, store: MemoryStore):
        """Without ``evolution_config``, fewer than three reflections never trigger."""
        mixin = EvolutionMixin(reflector=LowQualityReflector())
        task, result = _make_task(), _make_result()
        for _attempt in range(2):
            outcome = await mixin.evolve_soul(task, result, memory_store=store)
            assert outcome is False