feat(evolution): auto-trigger + quality gate + actor marking (U6, R5/R6)
U6 of the complex task quality loop plan. R5 (auto evolution trigger + quality gate): - EvolutionConfig (Pydantic v2): success_sample_rate=0.1, min_confidence=0.5, min_examples=3, observe_only=True, cross_workspace_sharing=False - Success path gated by success_sample_rate; failure path always runs (100%) - Observe-only mode records reflections without feeding optimizer (RV14: avoids noise-driven prompt degradation during initial rollout) - PromptOptimizer.can_optimize() consumption gate: sample count >= min_examples AND mean quality >= min_confidence - PitfallDetector confidence threshold: low-confidence warnings marked observe-only; confidence = failure_rate * min(1.0, total/3) linear ramp (ponytail: upgrade to Wilson interval) R6 (actor marking + cross-workspace sharing): - All evolution artifacts (EvolutionLogEntry, Module, PitfallWarning) carry actor field; defaults to result.agent_name - can_share_artifact(): same-workspace always allowed; cross-workspace requires explicit opt-in via EvolutionConfig.cross_workspace_sharing=True KTD-8: gave_up_after_reflections treated as failure path (triggers 100% evolution) even when stream wrapper marks status as COMPLETED. Detection via output_data.trace_outcome or error_message substring (ponytail: heuristic; upgrade path is a dedicated TaskResult.trace_outcome field). Backward compat: all gates conditional on auto_evolution_config is not None; existing EvolutionMixin usage without config preserves prior behavior. Tests: tests/unit/test_evolution_auto_trigger.py (37 tests) covers R5/R6 scenarios - sample rate gate, observe-only, consumption gate, pitfall confidence, actor marking, cross-workspace sharing, gave_up_after_reflections, error handling, fire-and-forget, backpressure cap, AE3 happy path.
This commit is contained in:
parent
1d09fafec9
commit
91a61f9b49
|
|
@ -11,6 +11,7 @@ from agentkit.evolution.prompt_optimizer import (
|
||||||
)
|
)
|
||||||
from agentkit.evolution.strategy_tuner import StrategyTuner
|
from agentkit.evolution.strategy_tuner import StrategyTuner
|
||||||
from agentkit.evolution.ab_tester import ABTester
|
from agentkit.evolution.ab_tester import ABTester
|
||||||
|
from agentkit.evolution.config import EvolutionConfig
|
||||||
from agentkit.evolution.evolution_store import (
|
from agentkit.evolution.evolution_store import (
|
||||||
EvolutionStore,
|
EvolutionStore,
|
||||||
EvolutionStoreProtocol,
|
EvolutionStoreProtocol,
|
||||||
|
|
@ -30,6 +31,7 @@ __all__ = [
|
||||||
"Module",
|
"Module",
|
||||||
"StrategyTuner",
|
"StrategyTuner",
|
||||||
"ABTester",
|
"ABTester",
|
||||||
|
"EvolutionConfig",
|
||||||
"EvolutionStore",
|
"EvolutionStore",
|
||||||
"EvolutionStoreProtocol",
|
"EvolutionStoreProtocol",
|
||||||
"PersistentEvolutionStore",
|
"PersistentEvolutionStore",
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,43 @@
|
||||||
|
"""EvolutionConfig - auto-evolution trigger configuration (U6, R5/R6).
|
||||||
|
|
||||||
|
R5: success sample rate gates success-path evolution at evolve_after_task() entry;
|
||||||
|
failure path always runs (100%). Quality gates (min_confidence, min_examples)
|
||||||
|
prevent noise-driven prompt degradation.
|
||||||
|
R6: actor marking on all evolution artifacts; cross-workspace sharing defaults off.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field
|
||||||
|
|
||||||
|
|
||||||
|
class EvolutionConfig(BaseModel):
|
||||||
|
"""Configuration for auto-evolution triggering and quality gates.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
success_sample_rate: Fraction of success-path tasks that trigger evolution
|
||||||
|
(``random.random() < rate``). Failure path always runs (100%).
|
||||||
|
Default 0.1 — 1 in 10 successful tasks feed evolution.
|
||||||
|
min_confidence: Minimum confidence for pitfall ingestion and optimizer
|
||||||
|
consumption. Low-confidence pitfalls are marked observe-only.
|
||||||
|
min_examples: Minimum sample count before PromptOptimizer may consume
|
||||||
|
them. Pairs with min_confidence as a two-part consumption gate.
|
||||||
|
observe_only: When True, reflections/examples are recorded but NOT fed
|
||||||
|
to the optimizer. Avoids noise-driven prompt degradation (RV14)
|
||||||
|
during initial rollout. Set False once signal quality is validated.
|
||||||
|
cross_workspace_sharing: When False (default), evolution artifacts
|
||||||
|
(pitfalls, optimized prompts) are NOT shared across agent/expert
|
||||||
|
workspaces. Same-workspace sharing is always on. Cross-workspace
|
||||||
|
requires explicit opt-in (R6 trust boundary).
|
||||||
|
actor_marking: When True, stamp the producing agent/expert identity on
|
||||||
|
all evolution artifacts for traceability (R6).
|
||||||
|
"""
|
||||||
|
|
||||||
|
model_config = ConfigDict(extra="forbid")
|
||||||
|
|
||||||
|
success_sample_rate: float = Field(default=0.1, ge=0.0, le=1.0)
|
||||||
|
min_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
|
||||||
|
min_examples: int = Field(default=3, ge=1)
|
||||||
|
observe_only: bool = True
|
||||||
|
cross_workspace_sharing: bool = False
|
||||||
|
actor_marking: bool = True
|
||||||
|
|
@ -4,14 +4,16 @@
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import random
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from sqlalchemy.exc import DBAPIError
|
from sqlalchemy.exc import DBAPIError
|
||||||
|
|
||||||
from agentkit.core.protocol import EvolutionEvent, TaskMessage, TaskResult
|
from agentkit.core.protocol import EvolutionEvent, TaskMessage, TaskResult, TaskStatus
|
||||||
from agentkit.evolution.ab_tester import ABTestConfig, ABTestResult, ABTester
|
from agentkit.evolution.ab_tester import ABTestConfig, ABTestResult, ABTester
|
||||||
|
from agentkit.evolution.config import EvolutionConfig
|
||||||
from agentkit.evolution.evolution_store import EvolutionStore
|
from agentkit.evolution.evolution_store import EvolutionStore
|
||||||
from agentkit.evolution.llm_reflector import LLMReflector
|
from agentkit.evolution.llm_reflector import LLMReflector
|
||||||
from agentkit.evolution.prompt_optimizer import (
|
from agentkit.evolution.prompt_optimizer import (
|
||||||
|
|
@ -39,6 +41,7 @@ class SoulEvolutionConfig:
|
||||||
@dataclass
|
@dataclass
|
||||||
class EvolutionLogEntry:
|
class EvolutionLogEntry:
|
||||||
"""进化日志条目"""
|
"""进化日志条目"""
|
||||||
|
|
||||||
task_id: str
|
task_id: str
|
||||||
reflection: Reflection | None = None
|
reflection: Reflection | None = None
|
||||||
optimized_module: Module | None = None
|
optimized_module: Module | None = None
|
||||||
|
|
@ -47,6 +50,12 @@ class EvolutionLogEntry:
|
||||||
rolled_back: bool = False
|
rolled_back: bool = False
|
||||||
event_id: str | None = None
|
event_id: str | None = None
|
||||||
created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
|
||||||
|
# R6: actor marking — which agent/expert produced this evolution artifact
|
||||||
|
actor: str = ""
|
||||||
|
# R5: whether this entry was gated by the success sample rate
|
||||||
|
sampled: bool = True
|
||||||
|
# R5: observe-only entries record but do not mutate prompts
|
||||||
|
observe_only: bool = False
|
||||||
|
|
||||||
|
|
||||||
class EvolutionMixin:
|
class EvolutionMixin:
|
||||||
|
|
@ -73,15 +82,14 @@ class EvolutionMixin:
|
||||||
auxiliary_model: str | None = None,
|
auxiliary_model: str | None = None,
|
||||||
strategy_tuning_enabled: bool = False,
|
strategy_tuning_enabled: bool = False,
|
||||||
evolution_config: SoulEvolutionConfig | None = None,
|
evolution_config: SoulEvolutionConfig | None = None,
|
||||||
|
auto_evolution_config: EvolutionConfig | None = None,
|
||||||
):
|
):
|
||||||
if reflector is not EvolutionMixin._UNSET:
|
if reflector is not EvolutionMixin._UNSET:
|
||||||
# 显式传入了 reflector 参数(包括 None)
|
# 显式传入了 reflector 参数(包括 None)
|
||||||
self._reflector = reflector
|
self._reflector = reflector
|
||||||
elif reflector_type is not None:
|
elif reflector_type is not None:
|
||||||
# 未传入 reflector,但指定了 reflector_type → 自动创建
|
# 未传入 reflector,但指定了 reflector_type → 自动创建
|
||||||
self._reflector = self._create_reflector(
|
self._reflector = self._create_reflector(reflector_type, llm_gateway, auxiliary_model)
|
||||||
reflector_type, llm_gateway, auxiliary_model
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
# 都未指定:保持向后兼容,reflector 为 None
|
# 都未指定:保持向后兼容,reflector 为 None
|
||||||
self._reflector = None
|
self._reflector = None
|
||||||
|
|
@ -93,6 +101,8 @@ class EvolutionMixin:
|
||||||
self._current_module: Module | None = None
|
self._current_module: Module | None = None
|
||||||
self._strategy_tuning_enabled = strategy_tuning_enabled
|
self._strategy_tuning_enabled = strategy_tuning_enabled
|
||||||
self._evolution_config = evolution_config
|
self._evolution_config = evolution_config
|
||||||
|
# U6/R5/R6: auto-evolution config (sample rate, quality gates, actor marking)
|
||||||
|
self._auto_evolution_config = auto_evolution_config
|
||||||
self.pending_soul_updates: dict[str, list] = {}
|
self.pending_soul_updates: dict[str, list] = {}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
@ -133,19 +143,43 @@ class EvolutionMixin:
|
||||||
task: TaskMessage,
|
task: TaskMessage,
|
||||||
result: TaskResult,
|
result: TaskResult,
|
||||||
memory_store: MemoryStore | None = None,
|
memory_store: MemoryStore | None = None,
|
||||||
|
actor: str | None = None,
|
||||||
) -> EvolutionLogEntry:
|
) -> EvolutionLogEntry:
|
||||||
"""任务完成后执行进化流程。
|
"""任务完成后执行进化流程。
|
||||||
|
|
||||||
流程:
|
流程:
|
||||||
1. Reflector 反思 → 得到 Reflection
|
1. R5 成功采样门控(仅 auto_evolution_config 配置时生效)
|
||||||
2. Soul 进化检查(如果 memory_store 可用)
|
2. Reflector 反思 → 得到 Reflection
|
||||||
3. 如果 Reflection 有改进建议 → PromptOptimizer 优化
|
3. Soul 进化检查(如果 memory_store 可用)
|
||||||
4. 如果优化产生了新 Prompt → ABTester 验证
|
4. 如果 Reflection 有改进建议 → PromptOptimizer 优化
|
||||||
5. 如果 AB 测试通过 → EvolutionStore 应用变更
|
5. 如果优化产生了新 Prompt → ABTester 验证
|
||||||
6. 如果 AB 测试失败 → 回滚
|
6. 如果 AB 测试通过 → EvolutionStore 应用变更
|
||||||
7. 如果策略调优启用 → StrategyTuner 调优
|
7. 如果 AB 测试失败 → 回滚
|
||||||
|
8. 如果策略调优启用 → StrategyTuner 调优
|
||||||
|
|
||||||
|
R5: 成功路径按 success_sample_rate 采样;失败路径始终执行(100%)。
|
||||||
|
R6: 所有进化产物携带 actor 标记。
|
||||||
|
KTD-8: gave_up_after_reflections 视为失败路径。
|
||||||
"""
|
"""
|
||||||
log_entry = EvolutionLogEntry(task_id=task.task_id)
|
# R6: actor marking — defaults to the agent that produced the result
|
||||||
|
resolved_actor = actor or result.agent_name or ""
|
||||||
|
log_entry = EvolutionLogEntry(task_id=task.task_id, actor=resolved_actor)
|
||||||
|
|
||||||
|
cfg = self._auto_evolution_config
|
||||||
|
|
||||||
|
# R5: success sample rate gate — only when auto_evolution_config is set.
|
||||||
|
# Failure path always runs (100%). KTD-8: gave_up_after_reflections = failure.
|
||||||
|
is_failure = self._is_failure_path(result)
|
||||||
|
if cfg is not None and not is_failure:
|
||||||
|
if random.random() >= cfg.success_sample_rate:
|
||||||
|
logger.debug(
|
||||||
|
"Success-path evolution skipped for task %s (sample rate %.2f)",
|
||||||
|
task.task_id,
|
||||||
|
cfg.success_sample_rate,
|
||||||
|
)
|
||||||
|
log_entry.sampled = False
|
||||||
|
self._evolution_log.append(log_entry)
|
||||||
|
return log_entry
|
||||||
|
|
||||||
# Step 1: 反思
|
# Step 1: 反思
|
||||||
if self._reflector is None:
|
if self._reflector is None:
|
||||||
|
|
@ -177,16 +211,46 @@ class EvolutionMixin:
|
||||||
self._evolution_log.append(log_entry)
|
self._evolution_log.append(log_entry)
|
||||||
return log_entry
|
return log_entry
|
||||||
|
|
||||||
|
# R5: observe-only mode — record reflection but do NOT feed optimizer.
|
||||||
|
# Avoids noise-driven prompt degradation during initial rollout (RV14).
|
||||||
|
if cfg is not None and cfg.observe_only:
|
||||||
|
logger.debug(
|
||||||
|
"Observe-only mode: recording reflection without feeding optimizer for task %s",
|
||||||
|
task.task_id,
|
||||||
|
)
|
||||||
|
log_entry.observe_only = True
|
||||||
|
self._evolution_log.append(log_entry)
|
||||||
|
return log_entry
|
||||||
|
|
||||||
|
# R5: consumption gate — sample count >= min_examples AND confidence达标.
|
||||||
|
min_conf = cfg.min_confidence if cfg is not None else 0.5
|
||||||
|
min_examples = cfg.min_examples if cfg is not None else 3
|
||||||
|
if hasattr(self._prompt_optimizer, "can_optimize"):
|
||||||
|
if not self._prompt_optimizer.can_optimize(
|
||||||
|
min_confidence=min_conf, min_examples=min_examples
|
||||||
|
):
|
||||||
|
logger.debug(
|
||||||
|
"Optimizer consumption gate not met for task %s, skipping optimization",
|
||||||
|
task.task_id,
|
||||||
|
)
|
||||||
|
self._evolution_log.append(log_entry)
|
||||||
|
return log_entry
|
||||||
|
|
||||||
# 将反思结果作为训练样本
|
# 将反思结果作为训练样本
|
||||||
self._prompt_optimizer.add_example(
|
self._prompt_optimizer.add_example(
|
||||||
input_data=task.input_data,
|
input_data=task.input_data,
|
||||||
output_data=result.output_data or {},
|
output_data=result.output_data or {},
|
||||||
quality_score=reflection.quality_score,
|
quality_score=reflection.quality_score,
|
||||||
|
actor=resolved_actor,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Pass trace and reflection to LLMPromptOptimizer if available
|
# Pass trace and reflection to LLMPromptOptimizer if available
|
||||||
optimized = await self._optimize_with_context(self._current_module, reflection)
|
optimized = await self._optimize_with_context(self._current_module, reflection)
|
||||||
|
|
||||||
|
# R6: stamp actor on optimized module
|
||||||
|
if cfg is None or cfg.actor_marking:
|
||||||
|
optimized.actor = resolved_actor
|
||||||
|
|
||||||
# 检查是否真正产生了变化
|
# 检查是否真正产生了变化
|
||||||
if optimized.name == self._current_module.name and not optimized.demos:
|
if optimized.name == self._current_module.name and not optimized.demos:
|
||||||
logger.debug("Optimization produced no meaningful changes")
|
logger.debug("Optimization produced no meaningful changes")
|
||||||
|
|
@ -240,9 +304,43 @@ class EvolutionMixin:
|
||||||
self._evolution_log.append(log_entry)
|
self._evolution_log.append(log_entry)
|
||||||
return log_entry
|
return log_entry
|
||||||
|
|
||||||
async def _optimize_with_context(
|
def _is_failure_path(self, result: TaskResult) -> bool:
|
||||||
self, module: Module, reflection: Reflection
|
"""Determine if a result should trigger failure-path evolution (100%).
|
||||||
) -> Module:
|
|
||||||
|
KTD-8: ``gave_up_after_reflections`` (U5) is treated as failure even when
|
||||||
|
the stream wrapper marks status as COMPLETED, because the reflexion loop
|
||||||
|
exhausted without producing a verified answer.
|
||||||
|
|
||||||
|
ponytail: string-matching on output_data/error_message is a heuristic;
|
||||||
|
upgrade path is a dedicated TaskResult.trace_outcome field.
|
||||||
|
"""
|
||||||
|
if result.status != TaskStatus.COMPLETED:
|
||||||
|
return True
|
||||||
|
# KTD-8: detect gave_up_after_reflections signal carried in output or error
|
||||||
|
if result.output_data and isinstance(result.output_data, dict):
|
||||||
|
if result.output_data.get("trace_outcome") == "gave_up_after_reflections":
|
||||||
|
return True
|
||||||
|
if result.error_message and "gave_up_after_reflections" in result.error_message:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
def can_share_artifact(self, source_actor: str, target_actor: str) -> bool:
|
||||||
|
"""R6: check if an evolution artifact can be shared between workspaces.
|
||||||
|
|
||||||
|
Same-workspace sharing is always on. Cross-workspace sharing requires
|
||||||
|
explicit opt-in via ``EvolutionConfig.cross_workspace_sharing=True``.
|
||||||
|
|
||||||
|
Trust boundary: evolution products are agent-produced and must be
|
||||||
|
validated before entering the shared store.
|
||||||
|
"""
|
||||||
|
if source_actor == target_actor:
|
||||||
|
return True
|
||||||
|
cfg = self._auto_evolution_config
|
||||||
|
if cfg is not None and cfg.cross_workspace_sharing:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def _optimize_with_context(self, module: Module, reflection: Reflection) -> Module:
|
||||||
"""Run optimization, passing reflection context if optimizer supports it"""
|
"""Run optimization, passing reflection context if optimizer supports it"""
|
||||||
from agentkit.evolution.prompt_optimizer import LLMPromptOptimizer
|
from agentkit.evolution.prompt_optimizer import LLMPromptOptimizer
|
||||||
|
|
||||||
|
|
@ -263,11 +361,13 @@ class EvolutionMixin:
|
||||||
|
|
||||||
# Create test if not exists
|
# Create test if not exists
|
||||||
if test_id not in self._ab_tester._tests:
|
if test_id not in self._ab_tester._tests:
|
||||||
self._ab_tester.create_test(ABTestConfig(
|
self._ab_tester.create_test(
|
||||||
test_id=test_id,
|
ABTestConfig(
|
||||||
agent_name=result.agent_name,
|
test_id=test_id,
|
||||||
change_type="prompt",
|
agent_name=result.agent_name,
|
||||||
))
|
change_type="prompt",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
# Assign group deterministically based on task_id
|
# Assign group deterministically based on task_id
|
||||||
group = self._ab_tester.assign_group(test_id, task_id=task.task_id)
|
group = self._ab_tester.assign_group(test_id, task_id=task.task_id)
|
||||||
|
|
@ -318,6 +418,9 @@ class EvolutionMixin:
|
||||||
"rolled_back": entry.rolled_back,
|
"rolled_back": entry.rolled_back,
|
||||||
"event_id": entry.event_id,
|
"event_id": entry.event_id,
|
||||||
"created_at": entry.created_at.isoformat(),
|
"created_at": entry.created_at.isoformat(),
|
||||||
|
"actor": entry.actor,
|
||||||
|
"sampled": entry.sampled,
|
||||||
|
"observe_only": entry.observe_only,
|
||||||
}
|
}
|
||||||
if entry.reflection:
|
if entry.reflection:
|
||||||
record["reflection"] = {
|
record["reflection"] = {
|
||||||
|
|
@ -444,9 +547,7 @@ class EvolutionMixin:
|
||||||
# 按 pattern 分类累积反思(patterns为空时使用默认category)
|
# 按 pattern 分类累积反思(patterns为空时使用默认category)
|
||||||
categories = reflection.patterns if reflection.patterns else ["default"]
|
categories = reflection.patterns if reflection.patterns else ["default"]
|
||||||
for pattern in categories:
|
for pattern in categories:
|
||||||
self.record_reflection(
|
self.record_reflection(pattern, reflection, task_type=task_type, score=score)
|
||||||
pattern, reflection, task_type=task_type, score=score
|
|
||||||
)
|
|
||||||
|
|
||||||
# 检查是否有类别满足触发条件
|
# 检查是否有类别满足触发条件
|
||||||
for category, reflections in list(self.pending_soul_updates.items()):
|
for category, reflections in list(self.pending_soul_updates.items()):
|
||||||
|
|
@ -455,9 +556,7 @@ class EvolutionMixin:
|
||||||
quality_gradient_triggered = False
|
quality_gradient_triggered = False
|
||||||
if len(scores) >= 3:
|
if len(scores) >= 3:
|
||||||
last_3 = scores[-3:]
|
last_3 = scores[-3:]
|
||||||
declines = [
|
declines = [last_3[i] - last_3[i - 1] for i in range(1, len(last_3))]
|
||||||
last_3[i] - last_3[i - 1] for i in range(1, len(last_3))
|
|
||||||
]
|
|
||||||
if all(d <= config.quality_gradient_threshold for d in declines):
|
if all(d <= config.quality_gradient_threshold for d in declines):
|
||||||
quality_gradient_triggered = True
|
quality_gradient_triggered = True
|
||||||
|
|
||||||
|
|
@ -467,7 +566,7 @@ class EvolutionMixin:
|
||||||
for r in reflections:
|
for r in reflections:
|
||||||
age_seconds = (now - r["timestamp"]).total_seconds()
|
age_seconds = (now - r["timestamp"]).total_seconds()
|
||||||
age_hours = age_seconds / 3600.0
|
age_hours = age_seconds / 3600.0
|
||||||
effective_count += config.time_decay_factor ** age_hours
|
effective_count += config.time_decay_factor**age_hours
|
||||||
# Round to avoid floating-point precision issues
|
# Round to avoid floating-point precision issues
|
||||||
# (e.g. 3 recent reflections should yield exactly 3.0)
|
# (e.g. 3 recent reflections should yield exactly 3.0)
|
||||||
effective_count = round(effective_count, 6)
|
effective_count = round(effective_count, 6)
|
||||||
|
|
@ -506,8 +605,7 @@ class EvolutionMixin:
|
||||||
|
|
||||||
if update_result.get("success"):
|
if update_result.get("success"):
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Soul evolved: category={category}, "
|
f"Soul evolved: category={category}, version={update_result.get('version')}"
|
||||||
f"version={update_result.get('version')}"
|
|
||||||
)
|
)
|
||||||
# 清除已处理的类别
|
# 清除已处理的类别
|
||||||
del self.pending_soul_updates[category]
|
del self.pending_soul_updates[category]
|
||||||
|
|
|
||||||
|
|
@ -33,6 +33,9 @@ class PitfallWarning:
|
||||||
failure_rate: 历史失败率(0.0 ~ 1.0)
|
failure_rate: 历史失败率(0.0 ~ 1.0)
|
||||||
historical_failures: 历史失败原因列表
|
historical_failures: 历史失败原因列表
|
||||||
suggestion: 优化建议
|
suggestion: 优化建议
|
||||||
|
confidence: 置信度(0.0 ~ 1.0),综合 failure_rate 和样本量计算
|
||||||
|
actor: 产生此预警对应的 agent/expert 标识(R6 actor marking)
|
||||||
|
observe_only: 低置信度预警标记为 observe-only,记录但不驱动优化
|
||||||
"""
|
"""
|
||||||
|
|
||||||
step_name: str
|
step_name: str
|
||||||
|
|
@ -40,6 +43,12 @@ class PitfallWarning:
|
||||||
failure_rate: float
|
failure_rate: float
|
||||||
historical_failures: list[str] = field(default_factory=list)
|
historical_failures: list[str] = field(default_factory=list)
|
||||||
suggestion: str = ""
|
suggestion: str = ""
|
||||||
|
# U6/R5: confidence score for quality gate before ingestion
|
||||||
|
confidence: float = 0.0
|
||||||
|
# U6/R6: actor marking — which agent/expert produced the underlying experiences
|
||||||
|
actor: str = ""
|
||||||
|
# U6/R5: low-confidence warnings are marked observe-only (not discarded)
|
||||||
|
observe_only: bool = False
|
||||||
|
|
||||||
|
|
||||||
class ExperienceStoreProtocol(Protocol):
|
class ExperienceStoreProtocol(Protocol):
|
||||||
|
|
@ -51,8 +60,7 @@ class ExperienceStoreProtocol(Protocol):
|
||||||
top_k: int = 5,
|
top_k: int = 5,
|
||||||
task_type: str | None = None,
|
task_type: str | None = None,
|
||||||
search_multiplier: int = 5,
|
search_multiplier: int = 5,
|
||||||
) -> list[Any]:
|
) -> list[Any]: ...
|
||||||
...
|
|
||||||
|
|
||||||
|
|
||||||
# 预警级别阈值
|
# 预警级别阈值
|
||||||
|
|
@ -89,27 +97,33 @@ class PitfallDetector:
|
||||||
experience_store: ExperienceStoreProtocol,
|
experience_store: ExperienceStoreProtocol,
|
||||||
similarity_threshold: float = 0.3,
|
similarity_threshold: float = 0.3,
|
||||||
max_search_results: int = 50,
|
max_search_results: int = 50,
|
||||||
|
min_confidence: float = 0.0,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Args:
|
Args:
|
||||||
experience_store: 经验存储实例(ExperienceStore 或 InMemoryExperienceStore)
|
experience_store: 经验存储实例(ExperienceStore 或 InMemoryExperienceStore)
|
||||||
similarity_threshold: 步骤名称关键词匹配的最小相似度阈值
|
similarity_threshold: 步骤名称关键词匹配的最小相似度阈值
|
||||||
max_search_results: 从经验存储检索的最大结果数
|
max_search_results: 从经验存储检索的最大结果数
|
||||||
|
min_confidence: 置信度阈值(U6/R5)。低于此值的预警标记为 observe_only。
|
||||||
|
默认 0.0 表示不过滤(保持向后兼容)。
|
||||||
"""
|
"""
|
||||||
self._store = experience_store
|
self._store = experience_store
|
||||||
self._similarity_threshold = similarity_threshold
|
self._similarity_threshold = similarity_threshold
|
||||||
self._max_search_results = max_search_results
|
self._max_search_results = max_search_results
|
||||||
|
self._min_confidence = min_confidence
|
||||||
|
|
||||||
async def check_pitfalls(
|
async def check_pitfalls(
|
||||||
self,
|
self,
|
||||||
task_type: str,
|
task_type: str,
|
||||||
planned_steps: list[Any],
|
planned_steps: list[Any],
|
||||||
|
actor: str = "",
|
||||||
) -> list[PitfallWarning]:
|
) -> list[PitfallWarning]:
|
||||||
"""检查计划步骤中的潜在陷阱
|
"""检查计划步骤中的潜在陷阱
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
task_type: 任务类型
|
task_type: 任务类型
|
||||||
planned_steps: 计划步骤列表(PlanStep 对象或具有 name/description 属性的对象)
|
planned_steps: 计划步骤列表(PlanStep 对象或具有 name/description 属性的对象)
|
||||||
|
actor: 产生此检测请求的 agent/expert 标识(R6 actor marking)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
按严重程度排序的预警列表(HIGH → MEDIUM → LOW)
|
按严重程度排序的预警列表(HIGH → MEDIUM → LOW)
|
||||||
|
|
@ -127,7 +141,7 @@ class PitfallDetector:
|
||||||
step_failure_stats = self._extract_step_failure_stats(all_experiences)
|
step_failure_stats = self._extract_step_failure_stats(all_experiences)
|
||||||
|
|
||||||
# 3. 匹配当前计划步骤并生成预警
|
# 3. 匹配当前计划步骤并生成预警
|
||||||
warnings = self._match_and_warn(planned_steps, step_failure_stats)
|
warnings = self._match_and_warn(planned_steps, step_failure_stats, actor=actor)
|
||||||
|
|
||||||
# 4. 按严重程度排序(HIGH → MEDIUM → LOW),同级别按失败率降序
|
# 4. 按严重程度排序(HIGH → MEDIUM → LOW),同级别按失败率降序
|
||||||
warnings.sort(key=lambda w: (_warning_level_order(w.warning_level), -w.failure_rate))
|
warnings.sort(key=lambda w: (_warning_level_order(w.warning_level), -w.failure_rate))
|
||||||
|
|
@ -208,8 +222,8 @@ class PitfallDetector:
|
||||||
s.failure_reasons.append(error)
|
s.failure_reasons.append(error)
|
||||||
|
|
||||||
# 收集优化建议 — only add to steps that are part of this experience
|
# 收集优化建议 — only add to steps that are part of this experience
|
||||||
if hasattr(exp, 'optimization_tips') and exp.optimization_tips:
|
if hasattr(exp, "optimization_tips") and exp.optimization_tips:
|
||||||
experience_steps = set(exp.steps) if hasattr(exp, 'steps') and exp.steps else set()
|
experience_steps = set(exp.steps) if hasattr(exp, "steps") and exp.steps else set()
|
||||||
for step_name, s in stats.items():
|
for step_name, s in stats.items():
|
||||||
if experience_steps and step_name in experience_steps:
|
if experience_steps and step_name in experience_steps:
|
||||||
s.optimization_tips.extend(exp.optimization_tips)
|
s.optimization_tips.extend(exp.optimization_tips)
|
||||||
|
|
@ -220,6 +234,7 @@ class PitfallDetector:
|
||||||
self,
|
self,
|
||||||
planned_steps: list[Any],
|
planned_steps: list[Any],
|
||||||
step_failure_stats: dict[str, _StepFailureStats],
|
step_failure_stats: dict[str, _StepFailureStats],
|
||||||
|
actor: str = "",
|
||||||
) -> list[PitfallWarning]:
|
) -> list[PitfallWarning]:
|
||||||
"""将计划步骤与失败统计进行匹配,生成预警"""
|
"""将计划步骤与失败统计进行匹配,生成预警"""
|
||||||
warnings: list[PitfallWarning] = []
|
warnings: list[PitfallWarning] = []
|
||||||
|
|
@ -236,9 +251,7 @@ class PitfallDetector:
|
||||||
best_similarity = 0.0
|
best_similarity = 0.0
|
||||||
|
|
||||||
for stats_step_name, stats in step_failure_stats.items():
|
for stats_step_name, stats in step_failure_stats.items():
|
||||||
similarity = _compute_name_similarity(
|
similarity = _compute_name_similarity(step_name, step_description, stats_step_name)
|
||||||
step_name, step_description, stats_step_name
|
|
||||||
)
|
|
||||||
if similarity > best_similarity:
|
if similarity > best_similarity:
|
||||||
best_similarity = similarity
|
best_similarity = similarity
|
||||||
best_match = stats
|
best_match = stats
|
||||||
|
|
@ -254,18 +267,29 @@ class PitfallDetector:
|
||||||
else 0.0
|
else 0.0
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# U6/R5: compute confidence from failure_rate and sample size.
|
||||||
|
# ponytail: linear ramp to 3 samples; upgrade to Wilson interval
|
||||||
|
# if precision matters at low sample counts.
|
||||||
|
confidence = _compute_confidence(failure_rate, best_match.total_occurrences)
|
||||||
|
|
||||||
# 分配预警级别
|
# 分配预警级别
|
||||||
warning_level = _determine_warning_level(failure_rate)
|
warning_level = _determine_warning_level(failure_rate)
|
||||||
|
|
||||||
# 生成建议
|
# 生成建议
|
||||||
suggestion = _build_suggestion(best_match, failure_rate)
|
suggestion = _build_suggestion(best_match, failure_rate)
|
||||||
|
|
||||||
|
# U6/R5: low-confidence warnings are marked observe-only (not discarded)
|
||||||
|
observe_only = self._min_confidence > 0.0 and confidence < self._min_confidence
|
||||||
|
|
||||||
warning = PitfallWarning(
|
warning = PitfallWarning(
|
||||||
step_name=step_name,
|
step_name=step_name,
|
||||||
warning_level=warning_level,
|
warning_level=warning_level,
|
||||||
failure_rate=round(failure_rate, 4),
|
failure_rate=round(failure_rate, 4),
|
||||||
historical_failures=best_match.failure_reasons[:5], # 最多保留 5 条
|
historical_failures=best_match.failure_reasons[:5], # 最多保留 5 条
|
||||||
suggestion=suggestion,
|
suggestion=suggestion,
|
||||||
|
confidence=round(confidence, 4),
|
||||||
|
actor=actor,
|
||||||
|
observe_only=observe_only,
|
||||||
)
|
)
|
||||||
warnings.append(warning)
|
warnings.append(warning)
|
||||||
|
|
||||||
|
|
@ -321,12 +345,48 @@ def _compute_name_similarity(
|
||||||
return len(intersection) / len(union)
|
return len(intersection) / len(union)
|
||||||
|
|
||||||
|
|
||||||
_STOP_WORDS = frozenset({
|
_STOP_WORDS = frozenset(
|
||||||
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
|
{
|
||||||
"of", "with", "by", "from", "is", "are", "was", "were", "be", "been",
|
"a",
|
||||||
"being", "have", "has", "had", "do", "does", "did", "will", "would",
|
"an",
|
||||||
"could", "should", "may", "might", "can", "shall", "not", "no",
|
"the",
|
||||||
})
|
"and",
|
||||||
|
"or",
|
||||||
|
"but",
|
||||||
|
"in",
|
||||||
|
"on",
|
||||||
|
"at",
|
||||||
|
"to",
|
||||||
|
"for",
|
||||||
|
"of",
|
||||||
|
"with",
|
||||||
|
"by",
|
||||||
|
"from",
|
||||||
|
"is",
|
||||||
|
"are",
|
||||||
|
"was",
|
||||||
|
"were",
|
||||||
|
"be",
|
||||||
|
"been",
|
||||||
|
"being",
|
||||||
|
"have",
|
||||||
|
"has",
|
||||||
|
"had",
|
||||||
|
"do",
|
||||||
|
"does",
|
||||||
|
"did",
|
||||||
|
"will",
|
||||||
|
"would",
|
||||||
|
"could",
|
||||||
|
"should",
|
||||||
|
"may",
|
||||||
|
"might",
|
||||||
|
"can",
|
||||||
|
"shall",
|
||||||
|
"not",
|
||||||
|
"no",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _extract_keywords(text: str) -> frozenset[str]:
|
def _extract_keywords(text: str) -> frozenset[str]:
|
||||||
|
|
@ -337,10 +397,7 @@ def _extract_keywords(text: str) -> frozenset[str]:
|
||||||
# 统一分隔符
|
# 统一分隔符
|
||||||
normalized = text.lower().replace("_", " ").replace("-", " ")
|
normalized = text.lower().replace("_", " ").replace("-", " ")
|
||||||
words = normalized.split()
|
words = normalized.split()
|
||||||
return frozenset(
|
return frozenset(w for w in words if len(w) > 1 and w not in _STOP_WORDS)
|
||||||
w for w in words
|
|
||||||
if len(w) > 1 and w not in _STOP_WORDS
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _determine_warning_level(failure_rate: float) -> WarningLevel:
|
def _determine_warning_level(failure_rate: float) -> WarningLevel:
|
||||||
|
|
@ -357,6 +414,33 @@ def _determine_warning_level(failure_rate: float) -> WarningLevel:
|
||||||
return WarningLevel.LOW
|
return WarningLevel.LOW
|
||||||
|
|
||||||
|
|
||||||
|
# U6/R5: minimum sample count for full confidence
|
||||||
|
_CONFIDENCE_FULL_SAMPLES = 3
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_confidence(failure_rate: float, total_occurrences: int) -> float:
|
||||||
|
"""Compute confidence score for a pitfall warning.
|
||||||
|
|
||||||
|
Combines failure_rate with sample size: small samples reduce confidence
|
||||||
|
linearly. A warning based on 1 occurrence is low-confidence even if the
|
||||||
|
failure_rate is high; 3+ occurrences yield full confidence.
|
||||||
|
|
||||||
|
ponytail: linear ramp is a naive heuristic; upgrade path is a Wilson
|
||||||
|
score interval for statistically rigorous low-sample confidence bounds.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
failure_rate: Historical failure rate (0.0 ~ 1.0).
|
||||||
|
total_occurrences: Total number of times this step was observed.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Confidence score (0.0 ~ 1.0).
|
||||||
|
"""
|
||||||
|
if total_occurrences <= 0:
|
||||||
|
return 0.0
|
||||||
|
sample_factor = min(1.0, total_occurrences / _CONFIDENCE_FULL_SAMPLES)
|
||||||
|
return failure_rate * sample_factor
|
||||||
|
|
||||||
|
|
||||||
def _warning_level_order(level: WarningLevel) -> int:
|
def _warning_level_order(level: WarningLevel) -> int:
|
||||||
"""预警级别排序值(越小越严重)"""
|
"""预警级别排序值(越小越严重)"""
|
||||||
return {
|
return {
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,7 @@ logger = logging.getLogger(__name__)
|
||||||
@dataclass
|
@dataclass
|
||||||
class Signature:
|
class Signature:
|
||||||
"""Prompt 签名 - 定义输入/输出字段"""
|
"""Prompt 签名 - 定义输入/输出字段"""
|
||||||
|
|
||||||
input_fields: dict[str, str] # name -> description
|
input_fields: dict[str, str] # name -> description
|
||||||
output_fields: dict[str, str] # name -> description
|
output_fields: dict[str, str] # name -> description
|
||||||
instruction: str = ""
|
instruction: str = ""
|
||||||
|
|
@ -41,10 +42,13 @@ class Signature:
|
||||||
@dataclass
|
@dataclass
|
||||||
class Module:
|
class Module:
|
||||||
"""可组合的 Prompt 策略模块"""
|
"""可组合的 Prompt 策略模块"""
|
||||||
|
|
||||||
name: str
|
name: str
|
||||||
signature: Signature
|
signature: Signature
|
||||||
template: str = ""
|
template: str = ""
|
||||||
demos: list[dict[str, Any]] = field(default_factory=list)
|
demos: list[dict[str, Any]] = field(default_factory=list)
|
||||||
|
# U6/R6: actor marking — which agent/expert produced this optimized module
|
||||||
|
actor: str = ""
|
||||||
|
|
||||||
def render(self, **kwargs) -> str:
|
def render(self, **kwargs) -> str:
|
||||||
parts = []
|
parts = []
|
||||||
|
|
@ -80,18 +84,42 @@ class BootstrapPromptOptimizer:
|
||||||
input_data: dict,
|
input_data: dict,
|
||||||
output_data: dict,
|
output_data: dict,
|
||||||
quality_score: float,
|
quality_score: float,
|
||||||
|
actor: str = "",
|
||||||
) -> None:
|
) -> None:
|
||||||
"""添加训练样本"""
|
"""添加训练样本"""
|
||||||
example = {
|
example = {
|
||||||
"input": input_data,
|
"input": input_data,
|
||||||
"output": output_data,
|
"output": output_data,
|
||||||
"quality_score": quality_score,
|
"quality_score": quality_score,
|
||||||
|
"actor": actor,
|
||||||
}
|
}
|
||||||
if quality_score >= 0.7:
|
if quality_score >= 0.7:
|
||||||
self._success_examples.append(example)
|
self._success_examples.append(example)
|
||||||
else:
|
else:
|
||||||
self._failure_examples.append(example)
|
self._failure_examples.append(example)
|
||||||
|
|
||||||
|
def can_optimize(self, min_confidence: float = 0.5, min_examples: int | None = None) -> bool:
|
||||||
|
"""U6/R5: consumption gate — sample count and confidence达标.
|
||||||
|
|
||||||
|
Returns True only when:
|
||||||
|
1. Success example count >= min_examples (default: constructor's
|
||||||
|
``min_examples_for_optimization``)
|
||||||
|
2. Mean quality score of success examples >= min_confidence
|
||||||
|
|
||||||
|
ponytail: mean-quality gate is redundant with the >= 0.7 success
|
||||||
|
threshold in add_example when min_confidence <= 0.7; upgrade path
|
||||||
|
is a diversity-weighted confidence metric if noise becomes an issue.
|
||||||
|
"""
|
||||||
|
threshold = min_examples if min_examples is not None else self._min_examples
|
||||||
|
if len(self._success_examples) < threshold:
|
||||||
|
return False
|
||||||
|
if not self._success_examples:
|
||||||
|
return False
|
||||||
|
mean_quality = sum(ex["quality_score"] for ex in self._success_examples) / len(
|
||||||
|
self._success_examples
|
||||||
|
)
|
||||||
|
return mean_quality >= min_confidence
|
||||||
|
|
||||||
async def optimize(self, module: Module) -> Module:
|
async def optimize(self, module: Module) -> Module:
|
||||||
"""优化 Module 的 Prompt
|
"""优化 Module 的 Prompt
|
||||||
|
|
||||||
|
|
@ -110,15 +138,17 @@ class BootstrapPromptOptimizer:
|
||||||
key=lambda x: x["quality_score"],
|
key=lambda x: x["quality_score"],
|
||||||
reverse=True,
|
reverse=True,
|
||||||
)
|
)
|
||||||
best_demos = sorted_examples[:self._max_demos]
|
best_demos = sorted_examples[: self._max_demos]
|
||||||
|
|
||||||
# 构建 few-shot 示例
|
# 构建 few-shot 示例
|
||||||
demos = []
|
demos = []
|
||||||
for example in best_demos:
|
for example in best_demos:
|
||||||
demos.append({
|
demos.append(
|
||||||
"input": str(example["input"]),
|
{
|
||||||
"output": str(example["output"]),
|
"input": str(example["input"]),
|
||||||
})
|
"output": str(example["output"]),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# 优化指令(基于失败案例的反面教材)
|
# 优化指令(基于失败案例的反面教材)
|
||||||
optimized_instruction = module.signature.instruction
|
optimized_instruction = module.signature.instruction
|
||||||
|
|
@ -127,9 +157,8 @@ class BootstrapPromptOptimizer:
|
||||||
for ex in self._failure_examples[-3:]:
|
for ex in self._failure_examples[-3:]:
|
||||||
failure_patterns.add(str(ex["input"])[:100])
|
failure_patterns.add(str(ex["input"])[:100])
|
||||||
if failure_patterns:
|
if failure_patterns:
|
||||||
optimized_instruction += (
|
optimized_instruction += "\n\nAvoid these patterns:\n" + "\n".join(
|
||||||
f"\n\nAvoid these patterns:\n"
|
f"- {p}" for p in failure_patterns
|
||||||
+ "\n".join(f"- {p}" for p in failure_patterns)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# 创建优化后的 Module
|
# 创建优化后的 Module
|
||||||
|
|
@ -186,9 +215,16 @@ class LLMPromptOptimizer:
|
||||||
input_data: dict,
|
input_data: dict,
|
||||||
output_data: dict,
|
output_data: dict,
|
||||||
quality_score: float,
|
quality_score: float,
|
||||||
|
actor: str = "",
|
||||||
) -> None:
|
) -> None:
|
||||||
"""添加训练样本(委托给 bootstrap 优化器)"""
|
"""添加训练样本(委托给 bootstrap 优化器)"""
|
||||||
self._bootstrap.add_example(input_data, output_data, quality_score)
|
self._bootstrap.add_example(input_data, output_data, quality_score, actor=actor)
|
||||||
|
|
||||||
|
def can_optimize(self, min_confidence: float = 0.5, min_examples: int | None = None) -> bool:
|
||||||
|
"""U6/R5: consumption gate — delegates to bootstrap optimizer."""
|
||||||
|
return self._bootstrap.can_optimize(
|
||||||
|
min_confidence=min_confidence, min_examples=min_examples
|
||||||
|
)
|
||||||
|
|
||||||
async def optimize(self, module: Module, trace: Any = None, reflection: Any = None) -> Module:
|
async def optimize(self, module: Module, trace: Any = None, reflection: Any = None) -> Module:
|
||||||
"""使用 LLM 优化 Module 的 Prompt
|
"""使用 LLM 优化 Module 的 Prompt
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,879 @@
|
||||||
|
"""Tests for U6: auto evolution trigger + quality gate + actor marking.
|
||||||
|
|
||||||
|
Covers R5 (success sample rate, quality thresholds, observe-only) and
|
||||||
|
R6 (actor marking, cross-workspace sharing gate).
|
||||||
|
|
||||||
|
Test scenarios:
|
||||||
|
- Happy path (AE3): failure -> evolution fires (100%); success -> fires at 0.1 rate
|
||||||
|
- Observe-only mode: recorded but not fed to optimizer
|
||||||
|
- Backpressure cap reached: evolution task dropped + logged
|
||||||
|
- Low-confidence pitfall: marked observe-only
|
||||||
|
- Evolution task error: caught, does not fail the stream
|
||||||
|
- PromptOptimizer sample count < 3: skip optimization
|
||||||
|
- Actor marking present on all artifacts
|
||||||
|
- Cross-workspace sharing rejected without opt-in
|
||||||
|
- gave_up_after_reflections triggers failure-path evolution
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from agentkit.core.protocol import TaskMessage, TaskResult, TaskStatus
|
||||||
|
from agentkit.evolution.config import EvolutionConfig
|
||||||
|
from agentkit.evolution.experience_schema import TaskExperience
|
||||||
|
from agentkit.evolution.experience_store import InMemoryExperienceStore
|
||||||
|
from agentkit.evolution.lifecycle import EvolutionMixin
|
||||||
|
from agentkit.evolution.pitfall_detector import (
|
||||||
|
PitfallDetector,
|
||||||
|
WarningLevel,
|
||||||
|
_compute_confidence,
|
||||||
|
)
|
||||||
|
from agentkit.evolution.prompt_optimizer import Module, PromptOptimizer, Signature
|
||||||
|
from agentkit.evolution.reflector import Reflection, Reflector
|
||||||
|
|
||||||
|
|
||||||
|
# ── Helpers ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _make_task(
|
||||||
|
task_id: str = "test-001",
|
||||||
|
agent_name: str = "evolving_agent",
|
||||||
|
) -> TaskMessage:
|
||||||
|
return TaskMessage(
|
||||||
|
task_id=task_id,
|
||||||
|
agent_name=agent_name,
|
||||||
|
task_type="echo",
|
||||||
|
priority=0,
|
||||||
|
input_data={"query": "hello"},
|
||||||
|
callback_url=None,
|
||||||
|
created_at=datetime.now(timezone.utc),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_result(
|
||||||
|
status: str = TaskStatus.COMPLETED,
|
||||||
|
output_data: dict | None = None,
|
||||||
|
error_message: str | None = None,
|
||||||
|
agent_name: str = "evolving_agent",
|
||||||
|
task_id: str = "test-001",
|
||||||
|
) -> TaskResult:
|
||||||
|
return TaskResult(
|
||||||
|
task_id=task_id,
|
||||||
|
agent_name=agent_name,
|
||||||
|
status=status,
|
||||||
|
output_data=output_data if output_data is not None else {"key": "value"},
|
||||||
|
error_message=error_message,
|
||||||
|
started_at=datetime.now(timezone.utc),
|
||||||
|
completed_at=datetime.now(timezone.utc),
|
||||||
|
metrics={"elapsed_seconds": 5.0},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_failure_result(
|
||||||
|
agent_name: str = "evolving_agent",
|
||||||
|
task_id: str = "test-001",
|
||||||
|
) -> TaskResult:
|
||||||
|
return _make_result(
|
||||||
|
status=TaskStatus.FAILED,
|
||||||
|
output_data=None,
|
||||||
|
error_message="task failed",
|
||||||
|
agent_name=agent_name,
|
||||||
|
task_id=task_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_module() -> Module:
|
||||||
|
return Module(
|
||||||
|
name="test_module",
|
||||||
|
signature=Signature(
|
||||||
|
input_fields={"query": "search query"},
|
||||||
|
output_fields={"result": "search result"},
|
||||||
|
instruction="Find the best result.",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class LowQualityReflector(Reflector):
|
||||||
|
"""Always produces failure outcome with improvement suggestions."""
|
||||||
|
|
||||||
|
async def reflect(self, task: TaskMessage, result: TaskResult) -> Reflection:
|
||||||
|
return Reflection(
|
||||||
|
task_id=task.task_id,
|
||||||
|
agent_name=result.agent_name,
|
||||||
|
outcome="failure",
|
||||||
|
quality_score=0.2,
|
||||||
|
patterns=["slow_execution"],
|
||||||
|
insights=["Low quality score indicates potential issues"],
|
||||||
|
suggestions=["Consider prompt optimization for this task type"],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class SuccessReflector(Reflector):
|
||||||
|
"""Always produces success outcome with suggestions (for testing success-path)."""
|
||||||
|
|
||||||
|
async def reflect(self, task: TaskMessage, result: TaskResult) -> Reflection:
|
||||||
|
return Reflection(
|
||||||
|
task_id=task.task_id,
|
||||||
|
agent_name=result.agent_name,
|
||||||
|
outcome="success",
|
||||||
|
quality_score=0.9,
|
||||||
|
patterns=["fast_execution"],
|
||||||
|
insights=["Good execution"],
|
||||||
|
suggestions=["Consider caching results for similar queries"],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class ErrorReflector(Reflector):
|
||||||
|
"""Always raises during reflection."""
|
||||||
|
|
||||||
|
async def reflect(self, task: TaskMessage, result: TaskResult) -> Reflection:
|
||||||
|
raise RuntimeError("reflector crashed")
|
||||||
|
|
||||||
|
|
||||||
|
def _make_experience(
|
||||||
|
task_type: str = "code_review",
|
||||||
|
outcome: str = "failure",
|
||||||
|
steps_summary: str | list = "",
|
||||||
|
success_rate: float = 0.0,
|
||||||
|
) -> TaskExperience:
|
||||||
|
return TaskExperience(
|
||||||
|
experience_id="",
|
||||||
|
task_type=task_type,
|
||||||
|
goal="test goal",
|
||||||
|
steps_summary=steps_summary,
|
||||||
|
outcome=outcome,
|
||||||
|
duration_seconds=10.0,
|
||||||
|
success_rate=success_rate,
|
||||||
|
failure_reasons=[],
|
||||||
|
optimization_tips=[],
|
||||||
|
created_at=datetime.now(timezone.utc),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── R5: Success sample rate gate ─────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestSuccessSampleRate:
|
||||||
|
"""R5: success-path evolution gated by success_sample_rate; failure always runs."""
|
||||||
|
|
||||||
|
async def test_failure_always_triggers_evolution(self):
|
||||||
|
"""Failure path always triggers evolution regardless of sample rate."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=0.0, observe_only=False)
|
||||||
|
reflector = LowQualityReflector()
|
||||||
|
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
|
||||||
|
mixin.set_current_module(_make_module())
|
||||||
|
|
||||||
|
task = _make_task()
|
||||||
|
result = _make_failure_result()
|
||||||
|
entry = await mixin.evolve_after_task(task, result)
|
||||||
|
|
||||||
|
assert entry.sampled is True
|
||||||
|
assert entry.reflection is not None
|
||||||
|
assert entry.reflection.outcome == "failure"
|
||||||
|
|
||||||
|
async def test_success_skipped_when_rate_zero(self):
|
||||||
|
"""Success path skipped when success_sample_rate=0.0."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=0.0, observe_only=False)
|
||||||
|
reflector = SuccessReflector()
|
||||||
|
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
|
||||||
|
|
||||||
|
task = _make_task()
|
||||||
|
result = _make_result(status=TaskStatus.COMPLETED)
|
||||||
|
entry = await mixin.evolve_after_task(task, result)
|
||||||
|
|
||||||
|
assert entry.sampled is False
|
||||||
|
assert entry.reflection is None # evolution skipped before reflection
|
||||||
|
|
||||||
|
async def test_success_runs_when_rate_one(self):
|
||||||
|
"""Success path runs when success_sample_rate=1.0."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=False)
|
||||||
|
reflector = SuccessReflector()
|
||||||
|
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
|
||||||
|
|
||||||
|
task = _make_task()
|
||||||
|
result = _make_result(status=TaskStatus.COMPLETED)
|
||||||
|
entry = await mixin.evolve_after_task(task, result)
|
||||||
|
|
||||||
|
assert entry.sampled is True
|
||||||
|
assert entry.reflection is not None
|
||||||
|
assert entry.reflection.outcome == "success"
|
||||||
|
|
||||||
|
async def test_success_sampled_at_rate_boundary(self):
|
||||||
|
"""At rate=0.1, random < 0.1 runs; random >= 0.1 skips."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=0.1, observe_only=False)
|
||||||
|
reflector = SuccessReflector()
|
||||||
|
|
||||||
|
# random < 0.1 -> evolution runs
|
||||||
|
mixin_run = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
|
||||||
|
with patch("agentkit.evolution.lifecycle.random.random", return_value=0.05):
|
||||||
|
entry = await mixin_run.evolve_after_task(
|
||||||
|
_make_task(), _make_result(status=TaskStatus.COMPLETED)
|
||||||
|
)
|
||||||
|
assert entry.sampled is True
|
||||||
|
assert entry.reflection is not None
|
||||||
|
|
||||||
|
# random >= 0.1 -> evolution skipped
|
||||||
|
mixin_skip = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
|
||||||
|
with patch("agentkit.evolution.lifecycle.random.random", return_value=0.15):
|
||||||
|
entry = await mixin_skip.evolve_after_task(
|
||||||
|
_make_task(), _make_result(status=TaskStatus.COMPLETED)
|
||||||
|
)
|
||||||
|
assert entry.sampled is False
|
||||||
|
assert entry.reflection is None
|
||||||
|
|
||||||
|
async def test_no_config_preserves_backward_compat(self):
|
||||||
|
"""Without auto_evolution_config, no sample gate applies (backward compat)."""
|
||||||
|
reflector = SuccessReflector()
|
||||||
|
mixin = EvolutionMixin(reflector=reflector)
|
||||||
|
|
||||||
|
task = _make_task()
|
||||||
|
result = _make_result(status=TaskStatus.COMPLETED)
|
||||||
|
entry = await mixin.evolve_after_task(task, result)
|
||||||
|
|
||||||
|
assert entry.sampled is True
|
||||||
|
assert entry.reflection is not None
|
||||||
|
|
||||||
|
|
||||||
|
# ── R5: Observe-only mode ────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestObserveOnly:
|
||||||
|
"""R5: observe-only mode records but does not feed optimizer."""
|
||||||
|
|
||||||
|
async def test_observe_only_records_without_optimizing(self):
|
||||||
|
"""Observe-only: reflection recorded, optimizer not fed."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=True, min_confidence=0.0)
|
||||||
|
reflector = LowQualityReflector()
|
||||||
|
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
|
||||||
|
mixin = EvolutionMixin(
|
||||||
|
reflector=reflector,
|
||||||
|
prompt_optimizer=optimizer,
|
||||||
|
auto_evolution_config=cfg,
|
||||||
|
)
|
||||||
|
mixin.set_current_module(_make_module())
|
||||||
|
|
||||||
|
task = _make_task()
|
||||||
|
result = _make_failure_result()
|
||||||
|
entry = await mixin.evolve_after_task(task, result)
|
||||||
|
|
||||||
|
assert entry.observe_only is True
|
||||||
|
assert entry.reflection is not None
|
||||||
|
assert entry.optimized_module is None
|
||||||
|
# Optimizer should NOT have been fed
|
||||||
|
success_count, _ = optimizer.example_count
|
||||||
|
assert success_count == 0
|
||||||
|
|
||||||
|
async def test_observe_only_false_allows_optimization(self):
|
||||||
|
"""When observe_only=False, optimization can proceed (if gates pass)."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=False, min_confidence=0.0)
|
||||||
|
reflector = LowQualityReflector()
|
||||||
|
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
|
||||||
|
# Pre-fill enough success examples to pass consumption gate
|
||||||
|
for i in range(3):
|
||||||
|
optimizer.add_example(
|
||||||
|
input_data={"query": f"q_{i}"},
|
||||||
|
output_data={"result": f"r_{i}"},
|
||||||
|
quality_score=0.9,
|
||||||
|
)
|
||||||
|
mixin = EvolutionMixin(
|
||||||
|
reflector=reflector,
|
||||||
|
prompt_optimizer=optimizer,
|
||||||
|
auto_evolution_config=cfg,
|
||||||
|
)
|
||||||
|
mixin.set_current_module(_make_module())
|
||||||
|
|
||||||
|
task = _make_task()
|
||||||
|
result = _make_failure_result()
|
||||||
|
entry = await mixin.evolve_after_task(task, result)
|
||||||
|
|
||||||
|
assert entry.observe_only is False
|
||||||
|
assert entry.optimized_module is not None
|
||||||
|
|
||||||
|
|
||||||
|
# ── R5: PromptOptimizer consumption gate ─────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestConsumptionGate:
|
||||||
|
"""R5: optimizer consumption gate — sample count >= min_examples AND confidence."""
|
||||||
|
|
||||||
|
async def test_sample_count_below_threshold_skips_optimization(self):
|
||||||
|
"""PromptOptimizer sample count < min_examples -> skip optimization."""
|
||||||
|
cfg = EvolutionConfig(
|
||||||
|
success_sample_rate=1.0,
|
||||||
|
observe_only=False,
|
||||||
|
min_examples=3,
|
||||||
|
min_confidence=0.0,
|
||||||
|
)
|
||||||
|
reflector = LowQualityReflector()
|
||||||
|
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=3)
|
||||||
|
# Only 2 success examples — below threshold
|
||||||
|
for i in range(2):
|
||||||
|
optimizer.add_example(
|
||||||
|
input_data={"query": f"q_{i}"},
|
||||||
|
output_data={"result": f"r_{i}"},
|
||||||
|
quality_score=0.9,
|
||||||
|
)
|
||||||
|
mixin = EvolutionMixin(
|
||||||
|
reflector=reflector,
|
||||||
|
prompt_optimizer=optimizer,
|
||||||
|
auto_evolution_config=cfg,
|
||||||
|
)
|
||||||
|
mixin.set_current_module(_make_module())
|
||||||
|
|
||||||
|
task = _make_task()
|
||||||
|
result = _make_failure_result()
|
||||||
|
entry = await mixin.evolve_after_task(task, result)
|
||||||
|
|
||||||
|
assert entry.optimized_module is None # gate not met
|
||||||
|
|
||||||
|
def test_can_optimize_returns_false_below_threshold(self):
|
||||||
|
"""can_optimize() returns False when sample count < min_examples."""
|
||||||
|
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=3)
|
||||||
|
assert optimizer.can_optimize(min_confidence=0.5) is False
|
||||||
|
|
||||||
|
def test_can_optimize_returns_true_above_threshold(self):
|
||||||
|
"""can_optimize() returns True when sample count and confidence met."""
|
||||||
|
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=3)
|
||||||
|
for i in range(3):
|
||||||
|
optimizer.add_example(
|
||||||
|
input_data={"query": f"q_{i}"},
|
||||||
|
output_data={"result": f"r_{i}"},
|
||||||
|
quality_score=0.9,
|
||||||
|
)
|
||||||
|
assert optimizer.can_optimize(min_confidence=0.5) is True
|
||||||
|
|
||||||
|
def test_can_optimize_returns_false_low_confidence(self):
|
||||||
|
"""can_optimize() returns False when mean quality < min_confidence."""
|
||||||
|
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=3)
|
||||||
|
for i in range(3):
|
||||||
|
optimizer.add_example(
|
||||||
|
input_data={"query": f"q_{i}"},
|
||||||
|
output_data={"result": f"r_{i}"},
|
||||||
|
quality_score=0.3, # below 0.5 threshold
|
||||||
|
)
|
||||||
|
# These go to failure_examples (quality < 0.7), so success_examples is empty
|
||||||
|
assert optimizer.can_optimize(min_confidence=0.5) is False
|
||||||
|
|
||||||
|
|
||||||
|
# ── R5: Pitfall confidence threshold ─────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestPitfallConfidence:
|
||||||
|
"""R5: low-confidence pitfalls marked observe-only."""
|
||||||
|
|
||||||
|
def test_compute_confidence_high_sample_high_rate(self):
|
||||||
|
"""3+ occurrences with high failure_rate -> high confidence."""
|
||||||
|
conf = _compute_confidence(failure_rate=0.6, total_occurrences=5)
|
||||||
|
assert conf == pytest.approx(0.6)
|
||||||
|
|
||||||
|
def test_compute_confidence_low_sample(self):
|
||||||
|
"""1 occurrence -> confidence scaled down by 1/3."""
|
||||||
|
conf = _compute_confidence(failure_rate=0.6, total_occurrences=1)
|
||||||
|
assert conf == pytest.approx(0.6 * (1.0 / 3.0))
|
||||||
|
|
||||||
|
def test_compute_confidence_zero_samples(self):
|
||||||
|
"""0 occurrences -> zero confidence."""
|
||||||
|
assert _compute_confidence(failure_rate=0.5, total_occurrences=0) == 0.0
|
||||||
|
|
||||||
|
async def test_low_confidence_pitfall_marked_observe_only(self):
|
||||||
|
"""Pitfall with confidence < min_confidence is marked observe-only."""
|
||||||
|
store = InMemoryExperienceStore(decay_rate=0.01, alpha=0.7)
|
||||||
|
# Only 1 failure experience -> low sample -> low confidence
|
||||||
|
await store.record_experience(
|
||||||
|
_make_experience(
|
||||||
|
task_type="testing",
|
||||||
|
outcome="failure",
|
||||||
|
steps_summary=[
|
||||||
|
{"step_name": "Run Tests", "outcome": "failure", "error": "Flaky"},
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
detector = PitfallDetector(
|
||||||
|
experience_store=store,
|
||||||
|
similarity_threshold=0.3,
|
||||||
|
min_confidence=0.5,
|
||||||
|
)
|
||||||
|
|
||||||
|
from agentkit.core.plan_schema import PlanStep, PlanStepStatus
|
||||||
|
|
||||||
|
steps = [
|
||||||
|
PlanStep(
|
||||||
|
step_id="s1",
|
||||||
|
name="Run Tests",
|
||||||
|
description="Run tests",
|
||||||
|
status=PlanStepStatus.PENDING,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
warnings = await detector.check_pitfalls(
|
||||||
|
task_type="testing", planned_steps=steps, actor="test_agent"
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(warnings) == 1
|
||||||
|
assert warnings[0].observe_only is True
|
||||||
|
assert warnings[0].confidence < 0.5
|
||||||
|
assert warnings[0].actor == "test_agent"
|
||||||
|
|
||||||
|
async def test_high_confidence_pitfall_not_observe_only(self):
|
||||||
|
"""Pitfall with confidence >= min_confidence is not observe-only."""
|
||||||
|
store = InMemoryExperienceStore(decay_rate=0.01, alpha=0.7)
|
||||||
|
# 3+ failure experiences -> full sample factor -> high confidence
|
||||||
|
for _ in range(4):
|
||||||
|
await store.record_experience(
|
||||||
|
_make_experience(
|
||||||
|
task_type="deployment",
|
||||||
|
outcome="failure",
|
||||||
|
steps_summary=[
|
||||||
|
{"step_name": "Deploy", "outcome": "failure", "error": "OOM"},
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
detector = PitfallDetector(
|
||||||
|
experience_store=store,
|
||||||
|
similarity_threshold=0.3,
|
||||||
|
min_confidence=0.5,
|
||||||
|
)
|
||||||
|
|
||||||
|
from agentkit.core.plan_schema import PlanStep, PlanStepStatus
|
||||||
|
|
||||||
|
steps = [
|
||||||
|
PlanStep(
|
||||||
|
step_id="s1", name="Deploy", description="Deploy app", status=PlanStepStatus.PENDING
|
||||||
|
)
|
||||||
|
]
|
||||||
|
warnings = await detector.check_pitfalls(task_type="deployment", planned_steps=steps)
|
||||||
|
|
||||||
|
assert len(warnings) == 1
|
||||||
|
assert warnings[0].observe_only is False
|
||||||
|
assert warnings[0].confidence >= 0.5
|
||||||
|
|
||||||
|
|
||||||
|
# ── R6: Actor marking ────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestActorMarking:
|
||||||
|
"""R6: actor marking on all evolution artifacts."""
|
||||||
|
|
||||||
|
async def test_log_entry_carries_actor(self):
|
||||||
|
"""EvolutionLogEntry carries the actor identity."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=False)
|
||||||
|
reflector = LowQualityReflector()
|
||||||
|
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
|
||||||
|
|
||||||
|
task = _make_task(agent_name="backend_engineer")
|
||||||
|
result = _make_failure_result(agent_name="backend_engineer")
|
||||||
|
entry = await mixin.evolve_after_task(task, result, actor="backend_engineer")
|
||||||
|
|
||||||
|
assert entry.actor == "backend_engineer"
|
||||||
|
|
||||||
|
async def test_actor_defaults_to_result_agent_name(self):
|
||||||
|
"""Actor defaults to result.agent_name when not explicitly provided."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=True)
|
||||||
|
reflector = LowQualityReflector()
|
||||||
|
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
|
||||||
|
|
||||||
|
task = _make_task(agent_name="qa_engineer")
|
||||||
|
result = _make_failure_result(agent_name="qa_engineer")
|
||||||
|
entry = await mixin.evolve_after_task(task, result)
|
||||||
|
|
||||||
|
assert entry.actor == "qa_engineer"
|
||||||
|
|
||||||
|
async def test_actor_marked_on_optimized_module(self):
|
||||||
|
"""Optimized Module carries the actor identity."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=False, min_confidence=0.0)
|
||||||
|
reflector = LowQualityReflector()
|
||||||
|
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
|
||||||
|
for i in range(3):
|
||||||
|
optimizer.add_example(
|
||||||
|
input_data={"query": f"q_{i}"},
|
||||||
|
output_data={"result": f"r_{i}"},
|
||||||
|
quality_score=0.9,
|
||||||
|
)
|
||||||
|
mixin = EvolutionMixin(
|
||||||
|
reflector=reflector,
|
||||||
|
prompt_optimizer=optimizer,
|
||||||
|
auto_evolution_config=cfg,
|
||||||
|
)
|
||||||
|
mixin.set_current_module(_make_module())
|
||||||
|
|
||||||
|
task = _make_task(agent_name="tech_lead")
|
||||||
|
result = _make_failure_result(agent_name="tech_lead")
|
||||||
|
entry = await mixin.evolve_after_task(task, result, actor="tech_lead")
|
||||||
|
|
||||||
|
assert entry.optimized_module is not None
|
||||||
|
assert entry.optimized_module.actor == "tech_lead"
|
||||||
|
|
||||||
|
async def test_actor_in_history(self):
|
||||||
|
"""get_evolution_history includes actor field."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=True)
|
||||||
|
reflector = LowQualityReflector()
|
||||||
|
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
|
||||||
|
|
||||||
|
await mixin.evolve_after_task(
|
||||||
|
_make_task(), _make_failure_result(), actor="frontend_engineer"
|
||||||
|
)
|
||||||
|
history = mixin.get_evolution_history()
|
||||||
|
assert len(history) == 1
|
||||||
|
assert history[0]["actor"] == "frontend_engineer"
|
||||||
|
|
||||||
|
async def test_pitfall_warning_carries_actor(self):
|
||||||
|
"""PitfallWarning carries the actor identity."""
|
||||||
|
store = InMemoryExperienceStore(decay_rate=0.01, alpha=0.7)
|
||||||
|
await store.record_experience(
|
||||||
|
_make_experience(
|
||||||
|
task_type="testing",
|
||||||
|
outcome="failure",
|
||||||
|
steps_summary=[
|
||||||
|
{"step_name": "Run Tests", "outcome": "failure", "error": "Error"},
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
detector = PitfallDetector(experience_store=store, similarity_threshold=0.3)
|
||||||
|
|
||||||
|
from agentkit.core.plan_schema import PlanStep, PlanStepStatus
|
||||||
|
|
||||||
|
steps = [
|
||||||
|
PlanStep(
|
||||||
|
step_id="s1",
|
||||||
|
name="Run Tests",
|
||||||
|
description="Run tests",
|
||||||
|
status=PlanStepStatus.PENDING,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
warnings = await detector.check_pitfalls(
|
||||||
|
task_type="testing", planned_steps=steps, actor="code_reviewer"
|
||||||
|
)
|
||||||
|
assert len(warnings) == 1
|
||||||
|
assert warnings[0].actor == "code_reviewer"
|
||||||
|
|
||||||
|
|
||||||
|
# ── R6: Cross-workspace sharing ──────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestCrossWorkspaceSharing:
|
||||||
|
"""R6: cross-workspace sharing defaults off; same-workspace always on."""
|
||||||
|
|
||||||
|
def test_same_workspace_sharing_always_allowed(self):
|
||||||
|
"""Same-actor sharing is always allowed."""
|
||||||
|
mixin = EvolutionMixin(reflector=Reflector())
|
||||||
|
assert mixin.can_share_artifact("agent_a", "agent_a") is True
|
||||||
|
|
||||||
|
def test_cross_workspace_sharing_default_off(self):
|
||||||
|
"""Cross-workspace sharing rejected without opt-in (default)."""
|
||||||
|
cfg = EvolutionConfig(cross_workspace_sharing=False)
|
||||||
|
mixin = EvolutionMixin(reflector=Reflector(), auto_evolution_config=cfg)
|
||||||
|
assert mixin.can_share_artifact("agent_a", "agent_b") is False
|
||||||
|
|
||||||
|
def test_cross_workspace_sharing_with_opt_in(self):
|
||||||
|
"""Cross-workspace sharing allowed when explicitly opted in."""
|
||||||
|
cfg = EvolutionConfig(cross_workspace_sharing=True)
|
||||||
|
mixin = EvolutionMixin(reflector=Reflector(), auto_evolution_config=cfg)
|
||||||
|
assert mixin.can_share_artifact("agent_a", "agent_b") is True
|
||||||
|
|
||||||
|
def test_no_config_cross_workspace_rejected(self):
|
||||||
|
"""Without config, cross-workspace sharing is rejected (safe default)."""
|
||||||
|
mixin = EvolutionMixin(reflector=Reflector())
|
||||||
|
assert mixin.can_share_artifact("agent_a", "agent_b") is False
|
||||||
|
|
||||||
|
|
||||||
|
# ── KTD-8: gave_up_after_reflections ─────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestGaveUpAfterReflections:
|
||||||
|
"""KTD-8: gave_up_after_reflections triggers failure-path evolution."""
|
||||||
|
|
||||||
|
async def test_gave_up_treated_as_failure(self):
|
||||||
|
"""gave_up_after_reflections in output_data triggers failure path."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=0.0, observe_only=True)
|
||||||
|
reflector = LowQualityReflector()
|
||||||
|
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
|
||||||
|
|
||||||
|
task = _make_task()
|
||||||
|
# status=COMPLETED but trace_outcome=gave_up_after_reflections
|
||||||
|
result = _make_result(
|
||||||
|
status=TaskStatus.COMPLETED,
|
||||||
|
output_data={"trace_outcome": "gave_up_after_reflections"},
|
||||||
|
)
|
||||||
|
entry = await mixin.evolve_after_task(task, result)
|
||||||
|
|
||||||
|
# Even though success_sample_rate=0.0, failure path always runs
|
||||||
|
assert entry.sampled is True
|
||||||
|
assert entry.reflection is not None
|
||||||
|
|
||||||
|
async def test_gave_up_in_error_message_treated_as_failure(self):
|
||||||
|
"""gave_up_after_reflections in error_message triggers failure path."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=0.0, observe_only=True)
|
||||||
|
reflector = LowQualityReflector()
|
||||||
|
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
|
||||||
|
|
||||||
|
task = _make_task()
|
||||||
|
result = _make_result(
|
||||||
|
status=TaskStatus.COMPLETED,
|
||||||
|
output_data={"content": "some output"},
|
||||||
|
error_message="gave_up_after_reflections: exhausted reinjections",
|
||||||
|
)
|
||||||
|
entry = await mixin.evolve_after_task(task, result)
|
||||||
|
|
||||||
|
assert entry.sampled is True
|
||||||
|
assert entry.reflection is not None
|
||||||
|
|
||||||
|
def test_is_failure_path_normal_success(self):
|
||||||
|
"""Normal success (COMPLETED, no gave_up signal) is not failure path."""
|
||||||
|
mixin = EvolutionMixin(reflector=Reflector())
|
||||||
|
result = _make_result(status=TaskStatus.COMPLETED, output_data={"key": "val"})
|
||||||
|
assert mixin._is_failure_path(result) is False
|
||||||
|
|
||||||
|
def test_is_failure_path_failed_status(self):
|
||||||
|
"""FAILED status is failure path."""
|
||||||
|
mixin = EvolutionMixin(reflector=Reflector())
|
||||||
|
result = _make_result(status=TaskStatus.FAILED, output_data=None)
|
||||||
|
assert mixin._is_failure_path(result) is True
|
||||||
|
|
||||||
|
def test_is_failure_path_cancelled_status(self):
|
||||||
|
"""CANCELLED status is failure path."""
|
||||||
|
mixin = EvolutionMixin(reflector=Reflector())
|
||||||
|
result = _make_result(status=TaskStatus.CANCELLED, output_data=None)
|
||||||
|
assert mixin._is_failure_path(result) is True
|
||||||
|
|
||||||
|
|
||||||
|
# ── Error handling: evolution does not fail the stream ───
|
||||||
|
|
||||||
|
|
||||||
|
class TestEvolutionErrorHandling:
|
||||||
|
"""Evolution task error is caught and does not propagate to the caller.
|
||||||
|
|
||||||
|
The _evolve_safe wrapper in config_driven.py catches all exceptions from
|
||||||
|
evolve_after_task. These tests verify that pattern.
|
||||||
|
"""
|
||||||
|
|
||||||
|
async def test_evolve_safe_swallows_reflector_error(self):
|
||||||
|
"""_evolve_safe pattern: reflector error is caught, not propagated."""
|
||||||
|
|
||||||
|
class SafeWrapper(EvolutionMixin):
|
||||||
|
"""Simulates the _evolve_safe pattern from ConfigDrivenAgent."""
|
||||||
|
|
||||||
|
async def _evolve_safe(self, task: TaskMessage, result: TaskResult) -> None:
|
||||||
|
try:
|
||||||
|
await self.evolve_after_task(task, result)
|
||||||
|
except Exception:
|
||||||
|
pass # swallowed, matching config_driven.py:_evolve_safe
|
||||||
|
|
||||||
|
mixin = SafeWrapper(reflector=ErrorReflector())
|
||||||
|
# Should not raise
|
||||||
|
await mixin._evolve_safe(_make_task(), _make_failure_result())
|
||||||
|
|
||||||
|
async def test_apply_change_error_does_not_crash_evolution(self):
|
||||||
|
"""_apply_change errors are caught internally (existing behavior)."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=False, min_confidence=0.0)
|
||||||
|
reflector = LowQualityReflector()
|
||||||
|
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
|
||||||
|
for i in range(3):
|
||||||
|
optimizer.add_example(
|
||||||
|
input_data={"query": f"q_{i}"},
|
||||||
|
output_data={"result": f"r_{i}"},
|
||||||
|
quality_score=0.9,
|
||||||
|
)
|
||||||
|
mixin = EvolutionMixin(
|
||||||
|
reflector=reflector,
|
||||||
|
prompt_optimizer=optimizer,
|
||||||
|
auto_evolution_config=cfg,
|
||||||
|
)
|
||||||
|
mixin.set_current_module(_make_module())
|
||||||
|
|
||||||
|
# Should complete without raising even if internal steps have issues
|
||||||
|
entry = await mixin.evolve_after_task(_make_task(), _make_failure_result())
|
||||||
|
assert entry is not None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Integration: fire-and-forget via asyncio.create_task ─
|
||||||
|
|
||||||
|
|
||||||
|
class TestFireAndForgetIntegration:
|
||||||
|
"""Evolution fires via U2's execute_stream hooks (fire-and-forget pattern).
|
||||||
|
|
||||||
|
Validates that evolve_after_task works correctly when scheduled as a
|
||||||
|
fire-and-forget asyncio task, matching _trigger_evolution_hooks behavior.
|
||||||
|
"""
|
||||||
|
|
||||||
|
async def test_evolve_after_task_completes_as_asyncio_task(self):
|
||||||
|
"""evolve_after_task completes when scheduled via asyncio.create_task."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=True)
|
||||||
|
reflector = LowQualityReflector()
|
||||||
|
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
|
||||||
|
|
||||||
|
task = _make_task()
|
||||||
|
result = _make_failure_result()
|
||||||
|
|
||||||
|
# Schedule as fire-and-forget task (mirrors _schedule_evolution)
|
||||||
|
async def _evolve():
|
||||||
|
await mixin.evolve_after_task(task, result)
|
||||||
|
|
||||||
|
t = asyncio.create_task(_evolve())
|
||||||
|
await t # wait for completion
|
||||||
|
|
||||||
|
history = mixin.get_evolution_history()
|
||||||
|
assert len(history) == 1
|
||||||
|
assert history[0]["reflection"] is not None
|
||||||
|
|
||||||
|
async def test_concurrent_evolution_tasks_isolated(self):
|
||||||
|
"""Multiple concurrent evolution tasks don't interfere."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=True)
|
||||||
|
reflector = LowQualityReflector()
|
||||||
|
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
|
||||||
|
|
||||||
|
async def _run_one(task_id: str):
|
||||||
|
await mixin.evolve_after_task(
|
||||||
|
_make_task(task_id=task_id),
|
||||||
|
_make_failure_result(task_id=task_id),
|
||||||
|
)
|
||||||
|
|
||||||
|
await asyncio.gather(
|
||||||
|
_run_one("task-a"),
|
||||||
|
_run_one("task-b"),
|
||||||
|
_run_one("task-c"),
|
||||||
|
)
|
||||||
|
|
||||||
|
history = mixin.get_evolution_history()
|
||||||
|
assert len(history) == 3
|
||||||
|
task_ids = {h["task_id"] for h in history}
|
||||||
|
assert task_ids == {"task-a", "task-b", "task-c"}
|
||||||
|
|
||||||
|
|
||||||
|
# ── Backpressure cap (U2 _schedule_evolution) ────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestBackpressureCap:
|
||||||
|
"""Backpressure cap reached -> evolution task dropped + logged.
|
||||||
|
|
||||||
|
Tests U2's _schedule_evolution backpressure, which U6's auto-trigger relies on.
|
||||||
|
"""
|
||||||
|
|
||||||
|
async def test_evolution_task_dropped_when_cap_reached(self):
|
||||||
|
"""When pending tasks reach cap, new evolution tasks are dropped."""
|
||||||
|
import agentkit.core.config_driven as cd
|
||||||
|
|
||||||
|
# Save original state to restore after test
|
||||||
|
try:
|
||||||
|
# Create blocking coroutines that won't complete during the test
|
||||||
|
block_event = asyncio.Event()
|
||||||
|
|
||||||
|
async def _blocking_evolve() -> None:
|
||||||
|
await block_event.wait()
|
||||||
|
|
||||||
|
cap = 4
|
||||||
|
# Fill up to cap
|
||||||
|
for _ in range(cap):
|
||||||
|
cd._schedule_evolution(_blocking_evolve(), cap=cap)
|
||||||
|
|
||||||
|
assert len(cd._pending_evolution_tasks) == cap
|
||||||
|
|
||||||
|
# Track dropped count before (access via module — int is immutable)
|
||||||
|
dropped_before = cd._evolution_dropped_count
|
||||||
|
|
||||||
|
# Try to schedule one more -> should be dropped
|
||||||
|
cd._schedule_evolution(_blocking_evolve(), cap=cap)
|
||||||
|
|
||||||
|
assert len(cd._pending_evolution_tasks) == cap # still at cap
|
||||||
|
assert cd._evolution_dropped_count == dropped_before + 1
|
||||||
|
|
||||||
|
# Release the blocking tasks so they can complete and be cleaned up
|
||||||
|
block_event.set()
|
||||||
|
# Let the event loop process task completions
|
||||||
|
await asyncio.sleep(0.05)
|
||||||
|
finally:
|
||||||
|
# Restore: clean up any remaining tasks
|
||||||
|
block_event = asyncio.Event()
|
||||||
|
block_event.set()
|
||||||
|
# Wait for any stragglers
|
||||||
|
if cd._pending_evolution_tasks:
|
||||||
|
await asyncio.gather(*cd._pending_evolution_tasks, return_exceptions=True)
|
||||||
|
cd._pending_evolution_tasks.clear()
|
||||||
|
|
||||||
|
|
||||||
|
# ── AE3: Happy path — pitfall detection ──────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class TestAE3HappyPath:
|
||||||
|
"""AE3: task fails -> evolution fires (100%) -> Reflector records ->
|
||||||
|
PitfallDetector detects; task succeeds -> evolution fires at 0.1 rate.
|
||||||
|
"""
|
||||||
|
|
||||||
|
async def test_failure_triggers_evolution_and_pitfall_detection(self):
|
||||||
|
"""Full happy path: failure -> evolution -> pitfall detection."""
|
||||||
|
# 1. Evolution fires on failure (100%)
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=0.0, observe_only=True)
|
||||||
|
reflector = LowQualityReflector()
|
||||||
|
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
|
||||||
|
|
||||||
|
task = _make_task()
|
||||||
|
result = _make_failure_result()
|
||||||
|
entry = await mixin.evolve_after_task(task, result)
|
||||||
|
assert entry.reflection is not None
|
||||||
|
assert entry.reflection.outcome == "failure"
|
||||||
|
|
||||||
|
# 2. PitfallDetector detects high-failure-rate step
|
||||||
|
store = InMemoryExperienceStore(decay_rate=0.01, alpha=0.7)
|
||||||
|
for _ in range(6):
|
||||||
|
await store.record_experience(
|
||||||
|
_make_experience(
|
||||||
|
task_type="order_processing",
|
||||||
|
outcome="failure",
|
||||||
|
steps_summary=[
|
||||||
|
{"step_name": "Call API", "outcome": "failure", "error": "timeout"},
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
for _ in range(4):
|
||||||
|
await store.record_experience(
|
||||||
|
_make_experience(
|
||||||
|
task_type="order_processing",
|
||||||
|
outcome="success",
|
||||||
|
success_rate=1.0,
|
||||||
|
steps_summary=[
|
||||||
|
{"step_name": "Call API", "outcome": "success"},
|
||||||
|
],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
detector = PitfallDetector(experience_store=store, similarity_threshold=0.3)
|
||||||
|
from agentkit.core.plan_schema import PlanStep, PlanStepStatus
|
||||||
|
|
||||||
|
steps = [
|
||||||
|
PlanStep(
|
||||||
|
step_id="s1",
|
||||||
|
name="Call API",
|
||||||
|
description="Call external API",
|
||||||
|
status=PlanStepStatus.PENDING,
|
||||||
|
)
|
||||||
|
]
|
||||||
|
warnings = await detector.check_pitfalls(task_type="order_processing", planned_steps=steps)
|
||||||
|
|
||||||
|
assert len(warnings) == 1
|
||||||
|
assert warnings[0].warning_level == WarningLevel.HIGH
|
||||||
|
assert warnings[0].failure_rate >= 0.5
|
||||||
|
|
||||||
|
async def test_success_sampled_at_0_1_rate(self):
|
||||||
|
"""Success path: with rate=0.1, ~10% of tasks trigger evolution."""
|
||||||
|
cfg = EvolutionConfig(success_sample_rate=0.1, observe_only=True)
|
||||||
|
reflector = SuccessReflector()
|
||||||
|
|
||||||
|
triggered = 0
|
||||||
|
total = 100
|
||||||
|
for _ in range(total):
|
||||||
|
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
|
||||||
|
entry = await mixin.evolve_after_task(
|
||||||
|
_make_task(), _make_result(status=TaskStatus.COMPLETED)
|
||||||
|
)
|
||||||
|
if entry.reflection is not None:
|
||||||
|
triggered += 1
|
||||||
|
|
||||||
|
# With rate=0.1 over 100 trials, expect ~10 (allow wide tolerance)
|
||||||
|
# ponytail: statistical test; flaky at extreme bounds. Upgrade to
|
||||||
|
# deterministic mock if CI reliability becomes an issue.
|
||||||
|
assert 1 <= triggered <= 25
|
||||||
Loading…
Reference in New Issue