feat: complex-task-quality-loop (R1-R12) #22

Merged
fischer merged 13 commits from feat/complex-task-quality-loop into main 2026-07-05 22:31:22 +08:00
6 changed files with 1198 additions and 56 deletions
Showing only changes of commit 91a61f9b49 - Show all commits

View File

@ -11,6 +11,7 @@ from agentkit.evolution.prompt_optimizer import (
) )
from agentkit.evolution.strategy_tuner import StrategyTuner from agentkit.evolution.strategy_tuner import StrategyTuner
from agentkit.evolution.ab_tester import ABTester from agentkit.evolution.ab_tester import ABTester
from agentkit.evolution.config import EvolutionConfig
from agentkit.evolution.evolution_store import ( from agentkit.evolution.evolution_store import (
EvolutionStore, EvolutionStore,
EvolutionStoreProtocol, EvolutionStoreProtocol,
@ -30,6 +31,7 @@ __all__ = [
"Module", "Module",
"StrategyTuner", "StrategyTuner",
"ABTester", "ABTester",
"EvolutionConfig",
"EvolutionStore", "EvolutionStore",
"EvolutionStoreProtocol", "EvolutionStoreProtocol",
"PersistentEvolutionStore", "PersistentEvolutionStore",

View File

@ -0,0 +1,43 @@
"""EvolutionConfig - auto-evolution trigger configuration (U6, R5/R6).
R5: success sample rate gates success-path evolution at evolve_after_task() entry;
failure path always runs (100%). Quality gates (min_confidence, min_examples)
prevent noise-driven prompt degradation.
R6: actor marking on all evolution artifacts; cross-workspace sharing defaults off.
"""
from __future__ import annotations
from pydantic import BaseModel, ConfigDict, Field
class EvolutionConfig(BaseModel):
"""Configuration for auto-evolution triggering and quality gates.
Attributes:
success_sample_rate: Fraction of success-path tasks that trigger evolution
(``random.random() < rate``). Failure path always runs (100%).
Default 0.1 1 in 10 successful tasks feed evolution.
min_confidence: Minimum confidence for pitfall ingestion and optimizer
consumption. Low-confidence pitfalls are marked observe-only.
min_examples: Minimum sample count before PromptOptimizer may consume
them. Pairs with min_confidence as a two-part consumption gate.
observe_only: When True, reflections/examples are recorded but NOT fed
to the optimizer. Avoids noise-driven prompt degradation (RV14)
during initial rollout. Set False once signal quality is validated.
cross_workspace_sharing: When False (default), evolution artifacts
(pitfalls, optimized prompts) are NOT shared across agent/expert
workspaces. Same-workspace sharing is always on. Cross-workspace
requires explicit opt-in (R6 trust boundary).
actor_marking: When True, stamp the producing agent/expert identity on
all evolution artifacts for traceability (R6).
"""
model_config = ConfigDict(extra="forbid")
success_sample_rate: float = Field(default=0.1, ge=0.0, le=1.0)
min_confidence: float = Field(default=0.5, ge=0.0, le=1.0)
min_examples: int = Field(default=3, ge=1)
observe_only: bool = True
cross_workspace_sharing: bool = False
actor_marking: bool = True

View File

@ -4,14 +4,16 @@
""" """
import logging import logging
import random
from dataclasses import dataclass, field from dataclasses import dataclass, field
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import Any from typing import Any
from sqlalchemy.exc import DBAPIError from sqlalchemy.exc import DBAPIError
from agentkit.core.protocol import EvolutionEvent, TaskMessage, TaskResult from agentkit.core.protocol import EvolutionEvent, TaskMessage, TaskResult, TaskStatus
from agentkit.evolution.ab_tester import ABTestConfig, ABTestResult, ABTester from agentkit.evolution.ab_tester import ABTestConfig, ABTestResult, ABTester
from agentkit.evolution.config import EvolutionConfig
from agentkit.evolution.evolution_store import EvolutionStore from agentkit.evolution.evolution_store import EvolutionStore
from agentkit.evolution.llm_reflector import LLMReflector from agentkit.evolution.llm_reflector import LLMReflector
from agentkit.evolution.prompt_optimizer import ( from agentkit.evolution.prompt_optimizer import (
@ -39,6 +41,7 @@ class SoulEvolutionConfig:
@dataclass @dataclass
class EvolutionLogEntry: class EvolutionLogEntry:
"""进化日志条目""" """进化日志条目"""
task_id: str task_id: str
reflection: Reflection | None = None reflection: Reflection | None = None
optimized_module: Module | None = None optimized_module: Module | None = None
@ -47,6 +50,12 @@ class EvolutionLogEntry:
rolled_back: bool = False rolled_back: bool = False
event_id: str | None = None event_id: str | None = None
created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc)) created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
# R6: actor marking — which agent/expert produced this evolution artifact
actor: str = ""
# R5: whether this entry was gated by the success sample rate
sampled: bool = True
# R5: observe-only entries record but do not mutate prompts
observe_only: bool = False
class EvolutionMixin: class EvolutionMixin:
@ -73,15 +82,14 @@ class EvolutionMixin:
auxiliary_model: str | None = None, auxiliary_model: str | None = None,
strategy_tuning_enabled: bool = False, strategy_tuning_enabled: bool = False,
evolution_config: SoulEvolutionConfig | None = None, evolution_config: SoulEvolutionConfig | None = None,
auto_evolution_config: EvolutionConfig | None = None,
): ):
if reflector is not EvolutionMixin._UNSET: if reflector is not EvolutionMixin._UNSET:
# 显式传入了 reflector 参数(包括 None # 显式传入了 reflector 参数(包括 None
self._reflector = reflector self._reflector = reflector
elif reflector_type is not None: elif reflector_type is not None:
# 未传入 reflector但指定了 reflector_type → 自动创建 # 未传入 reflector但指定了 reflector_type → 自动创建
self._reflector = self._create_reflector( self._reflector = self._create_reflector(reflector_type, llm_gateway, auxiliary_model)
reflector_type, llm_gateway, auxiliary_model
)
else: else:
# 都未指定保持向后兼容reflector 为 None # 都未指定保持向后兼容reflector 为 None
self._reflector = None self._reflector = None
@ -93,6 +101,8 @@ class EvolutionMixin:
self._current_module: Module | None = None self._current_module: Module | None = None
self._strategy_tuning_enabled = strategy_tuning_enabled self._strategy_tuning_enabled = strategy_tuning_enabled
self._evolution_config = evolution_config self._evolution_config = evolution_config
# U6/R5/R6: auto-evolution config (sample rate, quality gates, actor marking)
self._auto_evolution_config = auto_evolution_config
self.pending_soul_updates: dict[str, list] = {} self.pending_soul_updates: dict[str, list] = {}
@staticmethod @staticmethod
@ -133,19 +143,43 @@ class EvolutionMixin:
task: TaskMessage, task: TaskMessage,
result: TaskResult, result: TaskResult,
memory_store: MemoryStore | None = None, memory_store: MemoryStore | None = None,
actor: str | None = None,
) -> EvolutionLogEntry: ) -> EvolutionLogEntry:
"""任务完成后执行进化流程。 """任务完成后执行进化流程。
流程 流程
1. Reflector 反思 得到 Reflection 1. R5 成功采样门控 auto_evolution_config 配置时生效
2. Soul 进化检查如果 memory_store 可用 2. Reflector 反思 得到 Reflection
3. 如果 Reflection 有改进建议 PromptOptimizer 优化 3. Soul 进化检查如果 memory_store 可用
4. 如果优化产生了新 Prompt ABTester 验证 4. 如果 Reflection 有改进建议 PromptOptimizer 优化
5. 如果 AB 测试通过 EvolutionStore 应用变更 5. 如果优化产生了新 Prompt ABTester 验证
6. 如果 AB 测试失败 回滚 6. 如果 AB 测试通过 EvolutionStore 应用变更
7. 如果策略调优启用 StrategyTuner 调优 7. 如果 AB 测试失败 回滚
8. 如果策略调优启用 StrategyTuner 调优
R5: 成功路径按 success_sample_rate 采样失败路径始终执行100%
R6: 所有进化产物携带 actor 标记
KTD-8: gave_up_after_reflections 视为失败路径
""" """
log_entry = EvolutionLogEntry(task_id=task.task_id) # R6: actor marking — defaults to the agent that produced the result
resolved_actor = actor or result.agent_name or ""
log_entry = EvolutionLogEntry(task_id=task.task_id, actor=resolved_actor)
cfg = self._auto_evolution_config
# R5: success sample rate gate — only when auto_evolution_config is set.
# Failure path always runs (100%). KTD-8: gave_up_after_reflections = failure.
is_failure = self._is_failure_path(result)
if cfg is not None and not is_failure:
if random.random() >= cfg.success_sample_rate:
logger.debug(
"Success-path evolution skipped for task %s (sample rate %.2f)",
task.task_id,
cfg.success_sample_rate,
)
log_entry.sampled = False
self._evolution_log.append(log_entry)
return log_entry
# Step 1: 反思 # Step 1: 反思
if self._reflector is None: if self._reflector is None:
@ -177,16 +211,46 @@ class EvolutionMixin:
self._evolution_log.append(log_entry) self._evolution_log.append(log_entry)
return log_entry return log_entry
# R5: observe-only mode — record reflection but do NOT feed optimizer.
# Avoids noise-driven prompt degradation during initial rollout (RV14).
if cfg is not None and cfg.observe_only:
logger.debug(
"Observe-only mode: recording reflection without feeding optimizer for task %s",
task.task_id,
)
log_entry.observe_only = True
self._evolution_log.append(log_entry)
return log_entry
# R5: consumption gate — sample count >= min_examples AND confidence达标.
min_conf = cfg.min_confidence if cfg is not None else 0.5
min_examples = cfg.min_examples if cfg is not None else 3
if hasattr(self._prompt_optimizer, "can_optimize"):
if not self._prompt_optimizer.can_optimize(
min_confidence=min_conf, min_examples=min_examples
):
logger.debug(
"Optimizer consumption gate not met for task %s, skipping optimization",
task.task_id,
)
self._evolution_log.append(log_entry)
return log_entry
# 将反思结果作为训练样本 # 将反思结果作为训练样本
self._prompt_optimizer.add_example( self._prompt_optimizer.add_example(
input_data=task.input_data, input_data=task.input_data,
output_data=result.output_data or {}, output_data=result.output_data or {},
quality_score=reflection.quality_score, quality_score=reflection.quality_score,
actor=resolved_actor,
) )
# Pass trace and reflection to LLMPromptOptimizer if available # Pass trace and reflection to LLMPromptOptimizer if available
optimized = await self._optimize_with_context(self._current_module, reflection) optimized = await self._optimize_with_context(self._current_module, reflection)
# R6: stamp actor on optimized module
if cfg is None or cfg.actor_marking:
optimized.actor = resolved_actor
# 检查是否真正产生了变化 # 检查是否真正产生了变化
if optimized.name == self._current_module.name and not optimized.demos: if optimized.name == self._current_module.name and not optimized.demos:
logger.debug("Optimization produced no meaningful changes") logger.debug("Optimization produced no meaningful changes")
@ -240,9 +304,43 @@ class EvolutionMixin:
self._evolution_log.append(log_entry) self._evolution_log.append(log_entry)
return log_entry return log_entry
async def _optimize_with_context( def _is_failure_path(self, result: TaskResult) -> bool:
self, module: Module, reflection: Reflection """Determine if a result should trigger failure-path evolution (100%).
) -> Module:
KTD-8: ``gave_up_after_reflections`` (U5) is treated as failure even when
the stream wrapper marks status as COMPLETED, because the reflexion loop
exhausted without producing a verified answer.
ponytail: string-matching on output_data/error_message is a heuristic;
upgrade path is a dedicated TaskResult.trace_outcome field.
"""
if result.status != TaskStatus.COMPLETED:
return True
# KTD-8: detect gave_up_after_reflections signal carried in output or error
if result.output_data and isinstance(result.output_data, dict):
if result.output_data.get("trace_outcome") == "gave_up_after_reflections":
return True
if result.error_message and "gave_up_after_reflections" in result.error_message:
return True
return False
def can_share_artifact(self, source_actor: str, target_actor: str) -> bool:
"""R6: check if an evolution artifact can be shared between workspaces.
Same-workspace sharing is always on. Cross-workspace sharing requires
explicit opt-in via ``EvolutionConfig.cross_workspace_sharing=True``.
Trust boundary: evolution products are agent-produced and must be
validated before entering the shared store.
"""
if source_actor == target_actor:
return True
cfg = self._auto_evolution_config
if cfg is not None and cfg.cross_workspace_sharing:
return True
return False
async def _optimize_with_context(self, module: Module, reflection: Reflection) -> Module:
"""Run optimization, passing reflection context if optimizer supports it""" """Run optimization, passing reflection context if optimizer supports it"""
from agentkit.evolution.prompt_optimizer import LLMPromptOptimizer from agentkit.evolution.prompt_optimizer import LLMPromptOptimizer
@ -263,11 +361,13 @@ class EvolutionMixin:
# Create test if not exists # Create test if not exists
if test_id not in self._ab_tester._tests: if test_id not in self._ab_tester._tests:
self._ab_tester.create_test(ABTestConfig( self._ab_tester.create_test(
test_id=test_id, ABTestConfig(
agent_name=result.agent_name, test_id=test_id,
change_type="prompt", agent_name=result.agent_name,
)) change_type="prompt",
)
)
# Assign group deterministically based on task_id # Assign group deterministically based on task_id
group = self._ab_tester.assign_group(test_id, task_id=task.task_id) group = self._ab_tester.assign_group(test_id, task_id=task.task_id)
@ -318,6 +418,9 @@ class EvolutionMixin:
"rolled_back": entry.rolled_back, "rolled_back": entry.rolled_back,
"event_id": entry.event_id, "event_id": entry.event_id,
"created_at": entry.created_at.isoformat(), "created_at": entry.created_at.isoformat(),
"actor": entry.actor,
"sampled": entry.sampled,
"observe_only": entry.observe_only,
} }
if entry.reflection: if entry.reflection:
record["reflection"] = { record["reflection"] = {
@ -444,9 +547,7 @@ class EvolutionMixin:
# 按 pattern 分类累积反思patterns为空时使用默认category # 按 pattern 分类累积反思patterns为空时使用默认category
categories = reflection.patterns if reflection.patterns else ["default"] categories = reflection.patterns if reflection.patterns else ["default"]
for pattern in categories: for pattern in categories:
self.record_reflection( self.record_reflection(pattern, reflection, task_type=task_type, score=score)
pattern, reflection, task_type=task_type, score=score
)
# 检查是否有类别满足触发条件 # 检查是否有类别满足触发条件
for category, reflections in list(self.pending_soul_updates.items()): for category, reflections in list(self.pending_soul_updates.items()):
@ -455,9 +556,7 @@ class EvolutionMixin:
quality_gradient_triggered = False quality_gradient_triggered = False
if len(scores) >= 3: if len(scores) >= 3:
last_3 = scores[-3:] last_3 = scores[-3:]
declines = [ declines = [last_3[i] - last_3[i - 1] for i in range(1, len(last_3))]
last_3[i] - last_3[i - 1] for i in range(1, len(last_3))
]
if all(d <= config.quality_gradient_threshold for d in declines): if all(d <= config.quality_gradient_threshold for d in declines):
quality_gradient_triggered = True quality_gradient_triggered = True
@ -467,7 +566,7 @@ class EvolutionMixin:
for r in reflections: for r in reflections:
age_seconds = (now - r["timestamp"]).total_seconds() age_seconds = (now - r["timestamp"]).total_seconds()
age_hours = age_seconds / 3600.0 age_hours = age_seconds / 3600.0
effective_count += config.time_decay_factor ** age_hours effective_count += config.time_decay_factor**age_hours
# Round to avoid floating-point precision issues # Round to avoid floating-point precision issues
# (e.g. 3 recent reflections should yield exactly 3.0) # (e.g. 3 recent reflections should yield exactly 3.0)
effective_count = round(effective_count, 6) effective_count = round(effective_count, 6)
@ -506,8 +605,7 @@ class EvolutionMixin:
if update_result.get("success"): if update_result.get("success"):
logger.info( logger.info(
f"Soul evolved: category={category}, " f"Soul evolved: category={category}, version={update_result.get('version')}"
f"version={update_result.get('version')}"
) )
# 清除已处理的类别 # 清除已处理的类别
del self.pending_soul_updates[category] del self.pending_soul_updates[category]

View File

@ -33,6 +33,9 @@ class PitfallWarning:
failure_rate: 历史失败率0.0 ~ 1.0 failure_rate: 历史失败率0.0 ~ 1.0
historical_failures: 历史失败原因列表 historical_failures: 历史失败原因列表
suggestion: 优化建议 suggestion: 优化建议
confidence: 置信度0.0 ~ 1.0综合 failure_rate 和样本量计算
actor: 产生此预警对应的 agent/expert 标识R6 actor marking
observe_only: 低置信度预警标记为 observe-only记录但不驱动优化
""" """
step_name: str step_name: str
@ -40,6 +43,12 @@ class PitfallWarning:
failure_rate: float failure_rate: float
historical_failures: list[str] = field(default_factory=list) historical_failures: list[str] = field(default_factory=list)
suggestion: str = "" suggestion: str = ""
# U6/R5: confidence score for quality gate before ingestion
confidence: float = 0.0
# U6/R6: actor marking — which agent/expert produced the underlying experiences
actor: str = ""
# U6/R5: low-confidence warnings are marked observe-only (not discarded)
observe_only: bool = False
class ExperienceStoreProtocol(Protocol): class ExperienceStoreProtocol(Protocol):
@ -51,8 +60,7 @@ class ExperienceStoreProtocol(Protocol):
top_k: int = 5, top_k: int = 5,
task_type: str | None = None, task_type: str | None = None,
search_multiplier: int = 5, search_multiplier: int = 5,
) -> list[Any]: ) -> list[Any]: ...
...
# 预警级别阈值 # 预警级别阈值
@ -89,27 +97,33 @@ class PitfallDetector:
experience_store: ExperienceStoreProtocol, experience_store: ExperienceStoreProtocol,
similarity_threshold: float = 0.3, similarity_threshold: float = 0.3,
max_search_results: int = 50, max_search_results: int = 50,
min_confidence: float = 0.0,
): ):
""" """
Args: Args:
experience_store: 经验存储实例ExperienceStore InMemoryExperienceStore experience_store: 经验存储实例ExperienceStore InMemoryExperienceStore
similarity_threshold: 步骤名称关键词匹配的最小相似度阈值 similarity_threshold: 步骤名称关键词匹配的最小相似度阈值
max_search_results: 从经验存储检索的最大结果数 max_search_results: 从经验存储检索的最大结果数
min_confidence: 置信度阈值U6/R5低于此值的预警标记为 observe_only
默认 0.0 表示不过滤保持向后兼容
""" """
self._store = experience_store self._store = experience_store
self._similarity_threshold = similarity_threshold self._similarity_threshold = similarity_threshold
self._max_search_results = max_search_results self._max_search_results = max_search_results
self._min_confidence = min_confidence
async def check_pitfalls( async def check_pitfalls(
self, self,
task_type: str, task_type: str,
planned_steps: list[Any], planned_steps: list[Any],
actor: str = "",
) -> list[PitfallWarning]: ) -> list[PitfallWarning]:
"""检查计划步骤中的潜在陷阱 """检查计划步骤中的潜在陷阱
Args: Args:
task_type: 任务类型 task_type: 任务类型
planned_steps: 计划步骤列表PlanStep 对象或具有 name/description 属性的对象 planned_steps: 计划步骤列表PlanStep 对象或具有 name/description 属性的对象
actor: 产生此检测请求的 agent/expert 标识R6 actor marking
Returns: Returns:
按严重程度排序的预警列表HIGH MEDIUM LOW 按严重程度排序的预警列表HIGH MEDIUM LOW
@ -127,7 +141,7 @@ class PitfallDetector:
step_failure_stats = self._extract_step_failure_stats(all_experiences) step_failure_stats = self._extract_step_failure_stats(all_experiences)
# 3. 匹配当前计划步骤并生成预警 # 3. 匹配当前计划步骤并生成预警
warnings = self._match_and_warn(planned_steps, step_failure_stats) warnings = self._match_and_warn(planned_steps, step_failure_stats, actor=actor)
# 4. 按严重程度排序HIGH → MEDIUM → LOW同级别按失败率降序 # 4. 按严重程度排序HIGH → MEDIUM → LOW同级别按失败率降序
warnings.sort(key=lambda w: (_warning_level_order(w.warning_level), -w.failure_rate)) warnings.sort(key=lambda w: (_warning_level_order(w.warning_level), -w.failure_rate))
@ -208,8 +222,8 @@ class PitfallDetector:
s.failure_reasons.append(error) s.failure_reasons.append(error)
# 收集优化建议 — only add to steps that are part of this experience # 收集优化建议 — only add to steps that are part of this experience
if hasattr(exp, 'optimization_tips') and exp.optimization_tips: if hasattr(exp, "optimization_tips") and exp.optimization_tips:
experience_steps = set(exp.steps) if hasattr(exp, 'steps') and exp.steps else set() experience_steps = set(exp.steps) if hasattr(exp, "steps") and exp.steps else set()
for step_name, s in stats.items(): for step_name, s in stats.items():
if experience_steps and step_name in experience_steps: if experience_steps and step_name in experience_steps:
s.optimization_tips.extend(exp.optimization_tips) s.optimization_tips.extend(exp.optimization_tips)
@ -220,6 +234,7 @@ class PitfallDetector:
self, self,
planned_steps: list[Any], planned_steps: list[Any],
step_failure_stats: dict[str, _StepFailureStats], step_failure_stats: dict[str, _StepFailureStats],
actor: str = "",
) -> list[PitfallWarning]: ) -> list[PitfallWarning]:
"""将计划步骤与失败统计进行匹配,生成预警""" """将计划步骤与失败统计进行匹配,生成预警"""
warnings: list[PitfallWarning] = [] warnings: list[PitfallWarning] = []
@ -236,9 +251,7 @@ class PitfallDetector:
best_similarity = 0.0 best_similarity = 0.0
for stats_step_name, stats in step_failure_stats.items(): for stats_step_name, stats in step_failure_stats.items():
similarity = _compute_name_similarity( similarity = _compute_name_similarity(step_name, step_description, stats_step_name)
step_name, step_description, stats_step_name
)
if similarity > best_similarity: if similarity > best_similarity:
best_similarity = similarity best_similarity = similarity
best_match = stats best_match = stats
@ -254,18 +267,29 @@ class PitfallDetector:
else 0.0 else 0.0
) )
# U6/R5: compute confidence from failure_rate and sample size.
# ponytail: linear ramp to 3 samples; upgrade to Wilson interval
# if precision matters at low sample counts.
confidence = _compute_confidence(failure_rate, best_match.total_occurrences)
# 分配预警级别 # 分配预警级别
warning_level = _determine_warning_level(failure_rate) warning_level = _determine_warning_level(failure_rate)
# 生成建议 # 生成建议
suggestion = _build_suggestion(best_match, failure_rate) suggestion = _build_suggestion(best_match, failure_rate)
# U6/R5: low-confidence warnings are marked observe-only (not discarded)
observe_only = self._min_confidence > 0.0 and confidence < self._min_confidence
warning = PitfallWarning( warning = PitfallWarning(
step_name=step_name, step_name=step_name,
warning_level=warning_level, warning_level=warning_level,
failure_rate=round(failure_rate, 4), failure_rate=round(failure_rate, 4),
historical_failures=best_match.failure_reasons[:5], # 最多保留 5 条 historical_failures=best_match.failure_reasons[:5], # 最多保留 5 条
suggestion=suggestion, suggestion=suggestion,
confidence=round(confidence, 4),
actor=actor,
observe_only=observe_only,
) )
warnings.append(warning) warnings.append(warning)
@ -321,12 +345,48 @@ def _compute_name_similarity(
return len(intersection) / len(union) return len(intersection) / len(union)
_STOP_WORDS = frozenset({ _STOP_WORDS = frozenset(
"a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", {
"of", "with", "by", "from", "is", "are", "was", "were", "be", "been", "a",
"being", "have", "has", "had", "do", "does", "did", "will", "would", "an",
"could", "should", "may", "might", "can", "shall", "not", "no", "the",
}) "and",
"or",
"but",
"in",
"on",
"at",
"to",
"for",
"of",
"with",
"by",
"from",
"is",
"are",
"was",
"were",
"be",
"been",
"being",
"have",
"has",
"had",
"do",
"does",
"did",
"will",
"would",
"could",
"should",
"may",
"might",
"can",
"shall",
"not",
"no",
}
)
def _extract_keywords(text: str) -> frozenset[str]: def _extract_keywords(text: str) -> frozenset[str]:
@ -337,10 +397,7 @@ def _extract_keywords(text: str) -> frozenset[str]:
# 统一分隔符 # 统一分隔符
normalized = text.lower().replace("_", " ").replace("-", " ") normalized = text.lower().replace("_", " ").replace("-", " ")
words = normalized.split() words = normalized.split()
return frozenset( return frozenset(w for w in words if len(w) > 1 and w not in _STOP_WORDS)
w for w in words
if len(w) > 1 and w not in _STOP_WORDS
)
def _determine_warning_level(failure_rate: float) -> WarningLevel: def _determine_warning_level(failure_rate: float) -> WarningLevel:
@ -357,6 +414,33 @@ def _determine_warning_level(failure_rate: float) -> WarningLevel:
return WarningLevel.LOW return WarningLevel.LOW
# U6/R5: minimum sample count for full confidence
_CONFIDENCE_FULL_SAMPLES = 3
def _compute_confidence(failure_rate: float, total_occurrences: int) -> float:
"""Compute confidence score for a pitfall warning.
Combines failure_rate with sample size: small samples reduce confidence
linearly. A warning based on 1 occurrence is low-confidence even if the
failure_rate is high; 3+ occurrences yield full confidence.
ponytail: linear ramp is a naive heuristic; upgrade path is a Wilson
score interval for statistically rigorous low-sample confidence bounds.
Args:
failure_rate: Historical failure rate (0.0 ~ 1.0).
total_occurrences: Total number of times this step was observed.
Returns:
Confidence score (0.0 ~ 1.0).
"""
if total_occurrences <= 0:
return 0.0
sample_factor = min(1.0, total_occurrences / _CONFIDENCE_FULL_SAMPLES)
return failure_rate * sample_factor
def _warning_level_order(level: WarningLevel) -> int: def _warning_level_order(level: WarningLevel) -> int:
"""预警级别排序值(越小越严重)""" """预警级别排序值(越小越严重)"""
return { return {

View File

@ -21,6 +21,7 @@ logger = logging.getLogger(__name__)
@dataclass @dataclass
class Signature: class Signature:
"""Prompt 签名 - 定义输入/输出字段""" """Prompt 签名 - 定义输入/输出字段"""
input_fields: dict[str, str] # name -> description input_fields: dict[str, str] # name -> description
output_fields: dict[str, str] # name -> description output_fields: dict[str, str] # name -> description
instruction: str = "" instruction: str = ""
@ -41,10 +42,13 @@ class Signature:
@dataclass @dataclass
class Module: class Module:
"""可组合的 Prompt 策略模块""" """可组合的 Prompt 策略模块"""
name: str name: str
signature: Signature signature: Signature
template: str = "" template: str = ""
demos: list[dict[str, Any]] = field(default_factory=list) demos: list[dict[str, Any]] = field(default_factory=list)
# U6/R6: actor marking — which agent/expert produced this optimized module
actor: str = ""
def render(self, **kwargs) -> str: def render(self, **kwargs) -> str:
parts = [] parts = []
@ -80,18 +84,42 @@ class BootstrapPromptOptimizer:
input_data: dict, input_data: dict,
output_data: dict, output_data: dict,
quality_score: float, quality_score: float,
actor: str = "",
) -> None: ) -> None:
"""添加训练样本""" """添加训练样本"""
example = { example = {
"input": input_data, "input": input_data,
"output": output_data, "output": output_data,
"quality_score": quality_score, "quality_score": quality_score,
"actor": actor,
} }
if quality_score >= 0.7: if quality_score >= 0.7:
self._success_examples.append(example) self._success_examples.append(example)
else: else:
self._failure_examples.append(example) self._failure_examples.append(example)
def can_optimize(self, min_confidence: float = 0.5, min_examples: int | None = None) -> bool:
"""U6/R5: consumption gate — sample count and confidence达标.
Returns True only when:
1. Success example count >= min_examples (default: constructor's
``min_examples_for_optimization``)
2. Mean quality score of success examples >= min_confidence
ponytail: mean-quality gate is redundant with the >= 0.7 success
threshold in add_example when min_confidence <= 0.7; upgrade path
is a diversity-weighted confidence metric if noise becomes an issue.
"""
threshold = min_examples if min_examples is not None else self._min_examples
if len(self._success_examples) < threshold:
return False
if not self._success_examples:
return False
mean_quality = sum(ex["quality_score"] for ex in self._success_examples) / len(
self._success_examples
)
return mean_quality >= min_confidence
async def optimize(self, module: Module) -> Module: async def optimize(self, module: Module) -> Module:
"""优化 Module 的 Prompt """优化 Module 的 Prompt
@ -110,15 +138,17 @@ class BootstrapPromptOptimizer:
key=lambda x: x["quality_score"], key=lambda x: x["quality_score"],
reverse=True, reverse=True,
) )
best_demos = sorted_examples[:self._max_demos] best_demos = sorted_examples[: self._max_demos]
# 构建 few-shot 示例 # 构建 few-shot 示例
demos = [] demos = []
for example in best_demos: for example in best_demos:
demos.append({ demos.append(
"input": str(example["input"]), {
"output": str(example["output"]), "input": str(example["input"]),
}) "output": str(example["output"]),
}
)
# 优化指令(基于失败案例的反面教材) # 优化指令(基于失败案例的反面教材)
optimized_instruction = module.signature.instruction optimized_instruction = module.signature.instruction
@ -127,9 +157,8 @@ class BootstrapPromptOptimizer:
for ex in self._failure_examples[-3:]: for ex in self._failure_examples[-3:]:
failure_patterns.add(str(ex["input"])[:100]) failure_patterns.add(str(ex["input"])[:100])
if failure_patterns: if failure_patterns:
optimized_instruction += ( optimized_instruction += "\n\nAvoid these patterns:\n" + "\n".join(
f"\n\nAvoid these patterns:\n" f"- {p}" for p in failure_patterns
+ "\n".join(f"- {p}" for p in failure_patterns)
) )
# 创建优化后的 Module # 创建优化后的 Module
@ -186,9 +215,16 @@ class LLMPromptOptimizer:
input_data: dict, input_data: dict,
output_data: dict, output_data: dict,
quality_score: float, quality_score: float,
actor: str = "",
) -> None: ) -> None:
"""添加训练样本(委托给 bootstrap 优化器)""" """添加训练样本(委托给 bootstrap 优化器)"""
self._bootstrap.add_example(input_data, output_data, quality_score) self._bootstrap.add_example(input_data, output_data, quality_score, actor=actor)
def can_optimize(self, min_confidence: float = 0.5, min_examples: int | None = None) -> bool:
"""U6/R5: consumption gate — delegates to bootstrap optimizer."""
return self._bootstrap.can_optimize(
min_confidence=min_confidence, min_examples=min_examples
)
async def optimize(self, module: Module, trace: Any = None, reflection: Any = None) -> Module: async def optimize(self, module: Module, trace: Any = None, reflection: Any = None) -> Module:
"""使用 LLM 优化 Module 的 Prompt """使用 LLM 优化 Module 的 Prompt

View File

@ -0,0 +1,879 @@
"""Tests for U6: auto evolution trigger + quality gate + actor marking.
Covers R5 (success sample rate, quality thresholds, observe-only) and
R6 (actor marking, cross-workspace sharing gate).
Test scenarios:
- Happy path (AE3): failure -> evolution fires (100%); success -> fires at 0.1 rate
- Observe-only mode: recorded but not fed to optimizer
- Backpressure cap reached: evolution task dropped + logged
- Low-confidence pitfall: marked observe-only
- Evolution task error: caught, does not fail the stream
- PromptOptimizer sample count < 3: skip optimization
- Actor marking present on all artifacts
- Cross-workspace sharing rejected without opt-in
- gave_up_after_reflections triggers failure-path evolution
"""
from __future__ import annotations
import asyncio
from datetime import datetime, timezone
from unittest.mock import patch
import pytest
from agentkit.core.protocol import TaskMessage, TaskResult, TaskStatus
from agentkit.evolution.config import EvolutionConfig
from agentkit.evolution.experience_schema import TaskExperience
from agentkit.evolution.experience_store import InMemoryExperienceStore
from agentkit.evolution.lifecycle import EvolutionMixin
from agentkit.evolution.pitfall_detector import (
PitfallDetector,
WarningLevel,
_compute_confidence,
)
from agentkit.evolution.prompt_optimizer import Module, PromptOptimizer, Signature
from agentkit.evolution.reflector import Reflection, Reflector
# ── Helpers ──────────────────────────────────────────────
def _make_task(
task_id: str = "test-001",
agent_name: str = "evolving_agent",
) -> TaskMessage:
return TaskMessage(
task_id=task_id,
agent_name=agent_name,
task_type="echo",
priority=0,
input_data={"query": "hello"},
callback_url=None,
created_at=datetime.now(timezone.utc),
)
def _make_result(
status: str = TaskStatus.COMPLETED,
output_data: dict | None = None,
error_message: str | None = None,
agent_name: str = "evolving_agent",
task_id: str = "test-001",
) -> TaskResult:
return TaskResult(
task_id=task_id,
agent_name=agent_name,
status=status,
output_data=output_data if output_data is not None else {"key": "value"},
error_message=error_message,
started_at=datetime.now(timezone.utc),
completed_at=datetime.now(timezone.utc),
metrics={"elapsed_seconds": 5.0},
)
def _make_failure_result(
agent_name: str = "evolving_agent",
task_id: str = "test-001",
) -> TaskResult:
return _make_result(
status=TaskStatus.FAILED,
output_data=None,
error_message="task failed",
agent_name=agent_name,
task_id=task_id,
)
def _make_module() -> Module:
return Module(
name="test_module",
signature=Signature(
input_fields={"query": "search query"},
output_fields={"result": "search result"},
instruction="Find the best result.",
),
)
class LowQualityReflector(Reflector):
"""Always produces failure outcome with improvement suggestions."""
async def reflect(self, task: TaskMessage, result: TaskResult) -> Reflection:
return Reflection(
task_id=task.task_id,
agent_name=result.agent_name,
outcome="failure",
quality_score=0.2,
patterns=["slow_execution"],
insights=["Low quality score indicates potential issues"],
suggestions=["Consider prompt optimization for this task type"],
)
class SuccessReflector(Reflector):
"""Always produces success outcome with suggestions (for testing success-path)."""
async def reflect(self, task: TaskMessage, result: TaskResult) -> Reflection:
return Reflection(
task_id=task.task_id,
agent_name=result.agent_name,
outcome="success",
quality_score=0.9,
patterns=["fast_execution"],
insights=["Good execution"],
suggestions=["Consider caching results for similar queries"],
)
class ErrorReflector(Reflector):
"""Always raises during reflection."""
async def reflect(self, task: TaskMessage, result: TaskResult) -> Reflection:
raise RuntimeError("reflector crashed")
def _make_experience(
task_type: str = "code_review",
outcome: str = "failure",
steps_summary: str | list = "",
success_rate: float = 0.0,
) -> TaskExperience:
return TaskExperience(
experience_id="",
task_type=task_type,
goal="test goal",
steps_summary=steps_summary,
outcome=outcome,
duration_seconds=10.0,
success_rate=success_rate,
failure_reasons=[],
optimization_tips=[],
created_at=datetime.now(timezone.utc),
)
# ── R5: Success sample rate gate ─────────────────────────
class TestSuccessSampleRate:
"""R5: success-path evolution gated by success_sample_rate; failure always runs."""
async def test_failure_always_triggers_evolution(self):
"""Failure path always triggers evolution regardless of sample rate."""
cfg = EvolutionConfig(success_sample_rate=0.0, observe_only=False)
reflector = LowQualityReflector()
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
mixin.set_current_module(_make_module())
task = _make_task()
result = _make_failure_result()
entry = await mixin.evolve_after_task(task, result)
assert entry.sampled is True
assert entry.reflection is not None
assert entry.reflection.outcome == "failure"
async def test_success_skipped_when_rate_zero(self):
"""Success path skipped when success_sample_rate=0.0."""
cfg = EvolutionConfig(success_sample_rate=0.0, observe_only=False)
reflector = SuccessReflector()
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
task = _make_task()
result = _make_result(status=TaskStatus.COMPLETED)
entry = await mixin.evolve_after_task(task, result)
assert entry.sampled is False
assert entry.reflection is None # evolution skipped before reflection
async def test_success_runs_when_rate_one(self):
"""Success path runs when success_sample_rate=1.0."""
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=False)
reflector = SuccessReflector()
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
task = _make_task()
result = _make_result(status=TaskStatus.COMPLETED)
entry = await mixin.evolve_after_task(task, result)
assert entry.sampled is True
assert entry.reflection is not None
assert entry.reflection.outcome == "success"
async def test_success_sampled_at_rate_boundary(self):
"""At rate=0.1, random < 0.1 runs; random >= 0.1 skips."""
cfg = EvolutionConfig(success_sample_rate=0.1, observe_only=False)
reflector = SuccessReflector()
# random < 0.1 -> evolution runs
mixin_run = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
with patch("agentkit.evolution.lifecycle.random.random", return_value=0.05):
entry = await mixin_run.evolve_after_task(
_make_task(), _make_result(status=TaskStatus.COMPLETED)
)
assert entry.sampled is True
assert entry.reflection is not None
# random >= 0.1 -> evolution skipped
mixin_skip = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
with patch("agentkit.evolution.lifecycle.random.random", return_value=0.15):
entry = await mixin_skip.evolve_after_task(
_make_task(), _make_result(status=TaskStatus.COMPLETED)
)
assert entry.sampled is False
assert entry.reflection is None
async def test_no_config_preserves_backward_compat(self):
"""Without auto_evolution_config, no sample gate applies (backward compat)."""
reflector = SuccessReflector()
mixin = EvolutionMixin(reflector=reflector)
task = _make_task()
result = _make_result(status=TaskStatus.COMPLETED)
entry = await mixin.evolve_after_task(task, result)
assert entry.sampled is True
assert entry.reflection is not None
# ── R5: Observe-only mode ────────────────────────────────
class TestObserveOnly:
"""R5: observe-only mode records but does not feed optimizer."""
async def test_observe_only_records_without_optimizing(self):
"""Observe-only: reflection recorded, optimizer not fed."""
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=True, min_confidence=0.0)
reflector = LowQualityReflector()
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
mixin = EvolutionMixin(
reflector=reflector,
prompt_optimizer=optimizer,
auto_evolution_config=cfg,
)
mixin.set_current_module(_make_module())
task = _make_task()
result = _make_failure_result()
entry = await mixin.evolve_after_task(task, result)
assert entry.observe_only is True
assert entry.reflection is not None
assert entry.optimized_module is None
# Optimizer should NOT have been fed
success_count, _ = optimizer.example_count
assert success_count == 0
async def test_observe_only_false_allows_optimization(self):
"""When observe_only=False, optimization can proceed (if gates pass)."""
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=False, min_confidence=0.0)
reflector = LowQualityReflector()
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
# Pre-fill enough success examples to pass consumption gate
for i in range(3):
optimizer.add_example(
input_data={"query": f"q_{i}"},
output_data={"result": f"r_{i}"},
quality_score=0.9,
)
mixin = EvolutionMixin(
reflector=reflector,
prompt_optimizer=optimizer,
auto_evolution_config=cfg,
)
mixin.set_current_module(_make_module())
task = _make_task()
result = _make_failure_result()
entry = await mixin.evolve_after_task(task, result)
assert entry.observe_only is False
assert entry.optimized_module is not None
# ── R5: PromptOptimizer consumption gate ─────────────────
class TestConsumptionGate:
"""R5: optimizer consumption gate — sample count >= min_examples AND confidence."""
async def test_sample_count_below_threshold_skips_optimization(self):
"""PromptOptimizer sample count < min_examples -> skip optimization."""
cfg = EvolutionConfig(
success_sample_rate=1.0,
observe_only=False,
min_examples=3,
min_confidence=0.0,
)
reflector = LowQualityReflector()
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=3)
# Only 2 success examples — below threshold
for i in range(2):
optimizer.add_example(
input_data={"query": f"q_{i}"},
output_data={"result": f"r_{i}"},
quality_score=0.9,
)
mixin = EvolutionMixin(
reflector=reflector,
prompt_optimizer=optimizer,
auto_evolution_config=cfg,
)
mixin.set_current_module(_make_module())
task = _make_task()
result = _make_failure_result()
entry = await mixin.evolve_after_task(task, result)
assert entry.optimized_module is None # gate not met
def test_can_optimize_returns_false_below_threshold(self):
"""can_optimize() returns False when sample count < min_examples."""
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=3)
assert optimizer.can_optimize(min_confidence=0.5) is False
def test_can_optimize_returns_true_above_threshold(self):
"""can_optimize() returns True when sample count and confidence met."""
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=3)
for i in range(3):
optimizer.add_example(
input_data={"query": f"q_{i}"},
output_data={"result": f"r_{i}"},
quality_score=0.9,
)
assert optimizer.can_optimize(min_confidence=0.5) is True
def test_can_optimize_returns_false_low_confidence(self):
"""can_optimize() returns False when mean quality < min_confidence."""
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=3)
for i in range(3):
optimizer.add_example(
input_data={"query": f"q_{i}"},
output_data={"result": f"r_{i}"},
quality_score=0.3, # below 0.5 threshold
)
# These go to failure_examples (quality < 0.7), so success_examples is empty
assert optimizer.can_optimize(min_confidence=0.5) is False
# ── R5: Pitfall confidence threshold ─────────────────────
class TestPitfallConfidence:
"""R5: low-confidence pitfalls marked observe-only."""
def test_compute_confidence_high_sample_high_rate(self):
"""3+ occurrences with high failure_rate -> high confidence."""
conf = _compute_confidence(failure_rate=0.6, total_occurrences=5)
assert conf == pytest.approx(0.6)
def test_compute_confidence_low_sample(self):
"""1 occurrence -> confidence scaled down by 1/3."""
conf = _compute_confidence(failure_rate=0.6, total_occurrences=1)
assert conf == pytest.approx(0.6 * (1.0 / 3.0))
def test_compute_confidence_zero_samples(self):
"""0 occurrences -> zero confidence."""
assert _compute_confidence(failure_rate=0.5, total_occurrences=0) == 0.0
async def test_low_confidence_pitfall_marked_observe_only(self):
"""Pitfall with confidence < min_confidence is marked observe-only."""
store = InMemoryExperienceStore(decay_rate=0.01, alpha=0.7)
# Only 1 failure experience -> low sample -> low confidence
await store.record_experience(
_make_experience(
task_type="testing",
outcome="failure",
steps_summary=[
{"step_name": "Run Tests", "outcome": "failure", "error": "Flaky"},
],
)
)
detector = PitfallDetector(
experience_store=store,
similarity_threshold=0.3,
min_confidence=0.5,
)
from agentkit.core.plan_schema import PlanStep, PlanStepStatus
steps = [
PlanStep(
step_id="s1",
name="Run Tests",
description="Run tests",
status=PlanStepStatus.PENDING,
)
]
warnings = await detector.check_pitfalls(
task_type="testing", planned_steps=steps, actor="test_agent"
)
assert len(warnings) == 1
assert warnings[0].observe_only is True
assert warnings[0].confidence < 0.5
assert warnings[0].actor == "test_agent"
async def test_high_confidence_pitfall_not_observe_only(self):
"""Pitfall with confidence >= min_confidence is not observe-only."""
store = InMemoryExperienceStore(decay_rate=0.01, alpha=0.7)
# 3+ failure experiences -> full sample factor -> high confidence
for _ in range(4):
await store.record_experience(
_make_experience(
task_type="deployment",
outcome="failure",
steps_summary=[
{"step_name": "Deploy", "outcome": "failure", "error": "OOM"},
],
)
)
detector = PitfallDetector(
experience_store=store,
similarity_threshold=0.3,
min_confidence=0.5,
)
from agentkit.core.plan_schema import PlanStep, PlanStepStatus
steps = [
PlanStep(
step_id="s1", name="Deploy", description="Deploy app", status=PlanStepStatus.PENDING
)
]
warnings = await detector.check_pitfalls(task_type="deployment", planned_steps=steps)
assert len(warnings) == 1
assert warnings[0].observe_only is False
assert warnings[0].confidence >= 0.5
# ── R6: Actor marking ────────────────────────────────────
class TestActorMarking:
"""R6: actor marking on all evolution artifacts."""
async def test_log_entry_carries_actor(self):
"""EvolutionLogEntry carries the actor identity."""
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=False)
reflector = LowQualityReflector()
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
task = _make_task(agent_name="backend_engineer")
result = _make_failure_result(agent_name="backend_engineer")
entry = await mixin.evolve_after_task(task, result, actor="backend_engineer")
assert entry.actor == "backend_engineer"
async def test_actor_defaults_to_result_agent_name(self):
"""Actor defaults to result.agent_name when not explicitly provided."""
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=True)
reflector = LowQualityReflector()
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
task = _make_task(agent_name="qa_engineer")
result = _make_failure_result(agent_name="qa_engineer")
entry = await mixin.evolve_after_task(task, result)
assert entry.actor == "qa_engineer"
async def test_actor_marked_on_optimized_module(self):
"""Optimized Module carries the actor identity."""
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=False, min_confidence=0.0)
reflector = LowQualityReflector()
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
for i in range(3):
optimizer.add_example(
input_data={"query": f"q_{i}"},
output_data={"result": f"r_{i}"},
quality_score=0.9,
)
mixin = EvolutionMixin(
reflector=reflector,
prompt_optimizer=optimizer,
auto_evolution_config=cfg,
)
mixin.set_current_module(_make_module())
task = _make_task(agent_name="tech_lead")
result = _make_failure_result(agent_name="tech_lead")
entry = await mixin.evolve_after_task(task, result, actor="tech_lead")
assert entry.optimized_module is not None
assert entry.optimized_module.actor == "tech_lead"
async def test_actor_in_history(self):
"""get_evolution_history includes actor field."""
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=True)
reflector = LowQualityReflector()
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
await mixin.evolve_after_task(
_make_task(), _make_failure_result(), actor="frontend_engineer"
)
history = mixin.get_evolution_history()
assert len(history) == 1
assert history[0]["actor"] == "frontend_engineer"
async def test_pitfall_warning_carries_actor(self):
"""PitfallWarning carries the actor identity."""
store = InMemoryExperienceStore(decay_rate=0.01, alpha=0.7)
await store.record_experience(
_make_experience(
task_type="testing",
outcome="failure",
steps_summary=[
{"step_name": "Run Tests", "outcome": "failure", "error": "Error"},
],
)
)
detector = PitfallDetector(experience_store=store, similarity_threshold=0.3)
from agentkit.core.plan_schema import PlanStep, PlanStepStatus
steps = [
PlanStep(
step_id="s1",
name="Run Tests",
description="Run tests",
status=PlanStepStatus.PENDING,
)
]
warnings = await detector.check_pitfalls(
task_type="testing", planned_steps=steps, actor="code_reviewer"
)
assert len(warnings) == 1
assert warnings[0].actor == "code_reviewer"
# ── R6: Cross-workspace sharing ──────────────────────────
class TestCrossWorkspaceSharing:
"""R6: cross-workspace sharing defaults off; same-workspace always on."""
def test_same_workspace_sharing_always_allowed(self):
"""Same-actor sharing is always allowed."""
mixin = EvolutionMixin(reflector=Reflector())
assert mixin.can_share_artifact("agent_a", "agent_a") is True
def test_cross_workspace_sharing_default_off(self):
"""Cross-workspace sharing rejected without opt-in (default)."""
cfg = EvolutionConfig(cross_workspace_sharing=False)
mixin = EvolutionMixin(reflector=Reflector(), auto_evolution_config=cfg)
assert mixin.can_share_artifact("agent_a", "agent_b") is False
def test_cross_workspace_sharing_with_opt_in(self):
"""Cross-workspace sharing allowed when explicitly opted in."""
cfg = EvolutionConfig(cross_workspace_sharing=True)
mixin = EvolutionMixin(reflector=Reflector(), auto_evolution_config=cfg)
assert mixin.can_share_artifact("agent_a", "agent_b") is True
def test_no_config_cross_workspace_rejected(self):
"""Without config, cross-workspace sharing is rejected (safe default)."""
mixin = EvolutionMixin(reflector=Reflector())
assert mixin.can_share_artifact("agent_a", "agent_b") is False
# ── KTD-8: gave_up_after_reflections ─────────────────────
class TestGaveUpAfterReflections:
"""KTD-8: gave_up_after_reflections triggers failure-path evolution."""
async def test_gave_up_treated_as_failure(self):
"""gave_up_after_reflections in output_data triggers failure path."""
cfg = EvolutionConfig(success_sample_rate=0.0, observe_only=True)
reflector = LowQualityReflector()
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
task = _make_task()
# status=COMPLETED but trace_outcome=gave_up_after_reflections
result = _make_result(
status=TaskStatus.COMPLETED,
output_data={"trace_outcome": "gave_up_after_reflections"},
)
entry = await mixin.evolve_after_task(task, result)
# Even though success_sample_rate=0.0, failure path always runs
assert entry.sampled is True
assert entry.reflection is not None
async def test_gave_up_in_error_message_treated_as_failure(self):
"""gave_up_after_reflections in error_message triggers failure path."""
cfg = EvolutionConfig(success_sample_rate=0.0, observe_only=True)
reflector = LowQualityReflector()
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
task = _make_task()
result = _make_result(
status=TaskStatus.COMPLETED,
output_data={"content": "some output"},
error_message="gave_up_after_reflections: exhausted reinjections",
)
entry = await mixin.evolve_after_task(task, result)
assert entry.sampled is True
assert entry.reflection is not None
def test_is_failure_path_normal_success(self):
"""Normal success (COMPLETED, no gave_up signal) is not failure path."""
mixin = EvolutionMixin(reflector=Reflector())
result = _make_result(status=TaskStatus.COMPLETED, output_data={"key": "val"})
assert mixin._is_failure_path(result) is False
def test_is_failure_path_failed_status(self):
"""FAILED status is failure path."""
mixin = EvolutionMixin(reflector=Reflector())
result = _make_result(status=TaskStatus.FAILED, output_data=None)
assert mixin._is_failure_path(result) is True
def test_is_failure_path_cancelled_status(self):
"""CANCELLED status is failure path."""
mixin = EvolutionMixin(reflector=Reflector())
result = _make_result(status=TaskStatus.CANCELLED, output_data=None)
assert mixin._is_failure_path(result) is True
# ── Error handling: evolution does not fail the stream ───
class TestEvolutionErrorHandling:
"""Evolution task error is caught and does not propagate to the caller.
The _evolve_safe wrapper in config_driven.py catches all exceptions from
evolve_after_task. These tests verify that pattern.
"""
async def test_evolve_safe_swallows_reflector_error(self):
"""_evolve_safe pattern: reflector error is caught, not propagated."""
class SafeWrapper(EvolutionMixin):
"""Simulates the _evolve_safe pattern from ConfigDrivenAgent."""
async def _evolve_safe(self, task: TaskMessage, result: TaskResult) -> None:
try:
await self.evolve_after_task(task, result)
except Exception:
pass # swallowed, matching config_driven.py:_evolve_safe
mixin = SafeWrapper(reflector=ErrorReflector())
# Should not raise
await mixin._evolve_safe(_make_task(), _make_failure_result())
async def test_apply_change_error_does_not_crash_evolution(self):
"""_apply_change errors are caught internally (existing behavior)."""
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=False, min_confidence=0.0)
reflector = LowQualityReflector()
optimizer = PromptOptimizer(max_demos=3, min_examples_for_optimization=1)
for i in range(3):
optimizer.add_example(
input_data={"query": f"q_{i}"},
output_data={"result": f"r_{i}"},
quality_score=0.9,
)
mixin = EvolutionMixin(
reflector=reflector,
prompt_optimizer=optimizer,
auto_evolution_config=cfg,
)
mixin.set_current_module(_make_module())
# Should complete without raising even if internal steps have issues
entry = await mixin.evolve_after_task(_make_task(), _make_failure_result())
assert entry is not None
# ── Integration: fire-and-forget via asyncio.create_task ─
class TestFireAndForgetIntegration:
"""Evolution fires via U2's execute_stream hooks (fire-and-forget pattern).
Validates that evolve_after_task works correctly when scheduled as a
fire-and-forget asyncio task, matching _trigger_evolution_hooks behavior.
"""
async def test_evolve_after_task_completes_as_asyncio_task(self):
"""evolve_after_task completes when scheduled via asyncio.create_task."""
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=True)
reflector = LowQualityReflector()
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
task = _make_task()
result = _make_failure_result()
# Schedule as fire-and-forget task (mirrors _schedule_evolution)
async def _evolve():
await mixin.evolve_after_task(task, result)
t = asyncio.create_task(_evolve())
await t # wait for completion
history = mixin.get_evolution_history()
assert len(history) == 1
assert history[0]["reflection"] is not None
async def test_concurrent_evolution_tasks_isolated(self):
"""Multiple concurrent evolution tasks don't interfere."""
cfg = EvolutionConfig(success_sample_rate=1.0, observe_only=True)
reflector = LowQualityReflector()
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
async def _run_one(task_id: str):
await mixin.evolve_after_task(
_make_task(task_id=task_id),
_make_failure_result(task_id=task_id),
)
await asyncio.gather(
_run_one("task-a"),
_run_one("task-b"),
_run_one("task-c"),
)
history = mixin.get_evolution_history()
assert len(history) == 3
task_ids = {h["task_id"] for h in history}
assert task_ids == {"task-a", "task-b", "task-c"}
# ── Backpressure cap (U2 _schedule_evolution) ────────────
class TestBackpressureCap:
"""Backpressure cap reached -> evolution task dropped + logged.
Tests U2's _schedule_evolution backpressure, which U6's auto-trigger relies on.
"""
async def test_evolution_task_dropped_when_cap_reached(self):
"""When pending tasks reach cap, new evolution tasks are dropped."""
import agentkit.core.config_driven as cd
# Save original state to restore after test
try:
# Create blocking coroutines that won't complete during the test
block_event = asyncio.Event()
async def _blocking_evolve() -> None:
await block_event.wait()
cap = 4
# Fill up to cap
for _ in range(cap):
cd._schedule_evolution(_blocking_evolve(), cap=cap)
assert len(cd._pending_evolution_tasks) == cap
# Track dropped count before (access via module — int is immutable)
dropped_before = cd._evolution_dropped_count
# Try to schedule one more -> should be dropped
cd._schedule_evolution(_blocking_evolve(), cap=cap)
assert len(cd._pending_evolution_tasks) == cap # still at cap
assert cd._evolution_dropped_count == dropped_before + 1
# Release the blocking tasks so they can complete and be cleaned up
block_event.set()
# Let the event loop process task completions
await asyncio.sleep(0.05)
finally:
# Restore: clean up any remaining tasks
block_event = asyncio.Event()
block_event.set()
# Wait for any stragglers
if cd._pending_evolution_tasks:
await asyncio.gather(*cd._pending_evolution_tasks, return_exceptions=True)
cd._pending_evolution_tasks.clear()
# ── AE3: Happy path — pitfall detection ──────────────────
class TestAE3HappyPath:
"""AE3: task fails -> evolution fires (100%) -> Reflector records ->
PitfallDetector detects; task succeeds -> evolution fires at 0.1 rate.
"""
async def test_failure_triggers_evolution_and_pitfall_detection(self):
"""Full happy path: failure -> evolution -> pitfall detection."""
# 1. Evolution fires on failure (100%)
cfg = EvolutionConfig(success_sample_rate=0.0, observe_only=True)
reflector = LowQualityReflector()
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
task = _make_task()
result = _make_failure_result()
entry = await mixin.evolve_after_task(task, result)
assert entry.reflection is not None
assert entry.reflection.outcome == "failure"
# 2. PitfallDetector detects high-failure-rate step
store = InMemoryExperienceStore(decay_rate=0.01, alpha=0.7)
for _ in range(6):
await store.record_experience(
_make_experience(
task_type="order_processing",
outcome="failure",
steps_summary=[
{"step_name": "Call API", "outcome": "failure", "error": "timeout"},
],
)
)
for _ in range(4):
await store.record_experience(
_make_experience(
task_type="order_processing",
outcome="success",
success_rate=1.0,
steps_summary=[
{"step_name": "Call API", "outcome": "success"},
],
)
)
detector = PitfallDetector(experience_store=store, similarity_threshold=0.3)
from agentkit.core.plan_schema import PlanStep, PlanStepStatus
steps = [
PlanStep(
step_id="s1",
name="Call API",
description="Call external API",
status=PlanStepStatus.PENDING,
)
]
warnings = await detector.check_pitfalls(task_type="order_processing", planned_steps=steps)
assert len(warnings) == 1
assert warnings[0].warning_level == WarningLevel.HIGH
assert warnings[0].failure_rate >= 0.5
async def test_success_sampled_at_0_1_rate(self):
"""Success path: with rate=0.1, ~10% of tasks trigger evolution."""
cfg = EvolutionConfig(success_sample_rate=0.1, observe_only=True)
reflector = SuccessReflector()
triggered = 0
total = 100
for _ in range(total):
mixin = EvolutionMixin(reflector=reflector, auto_evolution_config=cfg)
entry = await mixin.evolve_after_task(
_make_task(), _make_result(status=TaskStatus.COMPLETED)
)
if entry.reflection is not None:
triggered += 1
# With rate=0.1 over 100 trials, expect ~10 (allow wide tolerance)
# ponytail: statistical test; flaky at extreme bounds. Upgrade to
# deterministic mock if CI reliability becomes an issue.
assert 1 <= triggered <= 25