feat(phase2): implement self-evolution and smart terminal (U6-U8)

- U6: PitfallDetector - detect historical failure patterns and warn - U7: PathOptimizer - discover and update optimal execution paths - U8: TerminalSession - session state, PTY interactive, output parsing 160 new tests passing. ShellTool enhanced with session_id support.
2026-06-10 00:22:36 +08:00 · 2026-06-10 00:22:36 +08:00 · e3d4f811dd
parent fd4a811929
commit e3d4f811dd
11 changed files with 4001 additions and 0 deletions
--- a/src/agentkit/evolution/path_optimizer.py
+++ b/src/agentkit/evolution/path_optimizer.py
@ -0,0 +1,259 @@
 """PathOptimizer - 执行路径优化器
 发现更优执行路径时自动更新经验库中的推荐路径。
 核心逻辑：
 1. 对比新路径与现有最优路径（综合耗时和成功率）
 2. 新路径成功率更高 → 更新推荐路径
 3. 成功率相近但耗时更短 → 更新推荐路径
 4. 样本量不足 → 不更新，记录待观察
 """
 from __future__ import annotations
 import logging
 import uuid
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
    from agentkit.evolution.experience_store import InMemoryExperienceStore
 logger = logging.getLogger(__name__)
@dataclass
 class ExecutionPath:
    """Execution path data model.

    Records the execution-path information for a specific task type so that
    paths can be compared during path optimization.

    Attributes:
        path_id: Unique identifier of the path.
        task_type: Task type the path belongs to.
        steps: Names of the execution steps.
        total_duration: Total duration in seconds.
        success_rate: Success rate (0.0 ~ 1.0).
        sample_count: Number of samples.
        is_recommended: Whether this is the currently recommended path.
        created_at: Creation time.
    """
    path_id: str = ""
    task_type: str = ""
    steps: list[str] = field(default_factory=list)
    total_duration: float = 0.0
    success_rate: float = 0.0
    sample_count: int = 0
    is_recommended: bool = False
    # Defaults to the timezone-aware current UTC time when the instance is built.
    created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
@dataclass
 class PathUpdateResult:
    """Outcome of evaluating a candidate path against the recommended one.

    Attributes:
        updated: Whether the recommended path was changed.
        old_path: The recommended path the candidate was compared against.
            PathOptimizer in this file leaves it as None when no comparison
            took place (no prior recommendation, or the candidate had too
            few samples).
        new_path: The candidate path that was evaluated. PathOptimizer in
            this file sets it on every result, including when the candidate
            was not promoted.
        reason: Human-readable explanation of why the recommendation was or
            was not updated.
    """
    updated: bool = False
    old_path: ExecutionPath | None = None
    new_path: ExecutionPath | None = None
    reason: str = ""
 class PathOptimizer:
    """Execution-path optimizer.

    Compares a candidate path with the currently recommended path of the
    same task type and decides whether the recommendation should change.

    Update policy:
    1. Candidate success rate > current success rate + success_rate_threshold
       -> update.
    2. Success rates are close (absolute difference <= threshold) and the
       candidate is markedly faster (relative duration improvement >
       duration_improvement_threshold) -> update.
    3. Candidate has fewer than min_sample_count samples -> no update; the
       candidate is recorded as pending.
    4. Anything else -> keep the current recommendation.
    """

    def __init__(
        self,
        experience_store: InMemoryExperienceStore | None = None,
        min_sample_count: int = 3,
        success_rate_threshold: float = 0.05,
        duration_improvement_threshold: float = 0.2,
    ):
        """Initialise the optimizer.

        Args:
            experience_store: Optional experience store instance. It is only
                stored; none of the methods in this class read it.
            min_sample_count: Minimum sample count; candidates below it are
                not considered for promotion.
            success_rate_threshold: Success-rate gain above which a candidate
                counts as a significant improvement.
            duration_improvement_threshold: Relative duration reduction above
                which a candidate counts as significantly faster.
        """
        self._experience_store = experience_store
        self._min_sample_count = min_sample_count
        self._success_rate_threshold = success_rate_threshold
        self._duration_improvement_threshold = duration_improvement_threshold
        self._recommended_paths: dict[str, ExecutionPath] = {}
        self._pending_paths: dict[str, list[ExecutionPath]] = {}

    def get_recommended_path(self, task_type: str) -> ExecutionPath | None:
        """Return the currently recommended path for a task type.

        Args:
            task_type: Task type to look up.

        Returns:
            The recommended path, or None when there is none.
        """
        return self._recommended_paths.get(task_type)

    async def evaluate_and_update(
        self,
        task_type: str,
        new_path: ExecutionPath,
    ) -> PathUpdateResult:
        """Evaluate a candidate path and update the recommendation if warranted.

        The candidate is modified in place: it receives a generated path_id
        when it has none, and its task_type is overwritten with ``task_type``.

        Args:
            task_type: Task type the candidate belongs to.
            new_path: Candidate execution path.

        Returns:
            The result describing whether the recommendation changed and why.
        """
        # Make sure the candidate can be identified later on.
        if not new_path.path_id:
            new_path.path_id = str(uuid.uuid4())
        new_path.task_type = task_type

        # Too few samples: do not decide yet, only record the candidate.
        if new_path.sample_count < self._min_sample_count:
            self._record_pending(task_type, new_path)
            reason = (
                f"样本量不足（{new_path.sample_count} < {self._min_sample_count}），"
                f"记录待观察"
            )
            logger.info("Path not updated for '%s': %s", task_type, reason)
            return PathUpdateResult(
                updated=False,
                old_path=None,
                new_path=new_path,
                reason=reason,
            )

        # The candidate now has enough samples, so it is no longer "pending".
        self._discard_pending(task_type, new_path.path_id)

        current = self._recommended_paths.get(task_type)
        # No recommendation yet: the candidate becomes the recommendation.
        if current is None:
            new_path.is_recommended = True
            self._recommended_paths[task_type] = new_path
            reason = "无现有推荐路径，直接设为推荐"
            logger.info(
                "Path set as recommended for '%s': %s", task_type, reason
            )
            return PathUpdateResult(
                updated=True,
                old_path=None,
                new_path=new_path,
                reason=reason,
            )

        return self._compare_and_decide(task_type, current, new_path)

    def _record_pending(self, task_type: str, path: ExecutionPath) -> None:
        """Store a pending candidate, replacing an entry with the same path_id."""
        pending = self._pending_paths.setdefault(task_type, [])
        for index, existing in enumerate(pending):
            if existing.path_id == path.path_id:
                # Same path observed again: keep only the latest snapshot.
                pending[index] = path
                return
        pending.append(path)

    def _discard_pending(self, task_type: str, path_id: str) -> None:
        """Remove every pending entry of ``task_type`` carrying ``path_id``."""
        pending = self._pending_paths.get(task_type)
        if not pending:
            return
        remaining = [p for p in pending if p.path_id != path_id]
        if remaining:
            self._pending_paths[task_type] = remaining
        else:
            del self._pending_paths[task_type]

    def _compare_and_decide(
        self,
        task_type: str,
        current: ExecutionPath,
        new: ExecutionPath,
    ) -> PathUpdateResult:
        """Compare the candidate with the current recommendation and decide.

        Decision order:
        1. Candidate success rate exceeds the current one by more than the
           threshold -> update.
        2. Success rates are close and the candidate is significantly
           faster -> update.
        3. Otherwise -> keep the current recommendation.
        """
        sr_diff = new.success_rate - current.success_rate

        # Rule 1: significantly higher success rate.
        if sr_diff > self._success_rate_threshold:
            return self._apply_update(
                task_type, current, new,
                f"成功率显著提升（{new.success_rate:.2f} > {current.success_rate:.2f}，"
                f"提升 {sr_diff:.2f}）",
            )

        # Rule 2: similar success rate but significantly shorter duration.
        # A current duration of 0 cannot be improved on (and would divide by
        # zero), so that case falls through to "keep current".
        if abs(sr_diff) <= self._success_rate_threshold and current.total_duration > 0:
            duration_improvement = (
                (current.total_duration - new.total_duration) / current.total_duration
            )
            if (
                new.total_duration < current.total_duration
                and duration_improvement > self._duration_improvement_threshold
            ):
                return self._apply_update(
                    task_type, current, new,
                    f"成功率相近（{new.success_rate:.2f} vs {current.success_rate:.2f}），"
                    f"耗时显著更短（{new.total_duration:.1f}s vs {current.total_duration:.1f}s，"
                    f"改善 {duration_improvement:.1%}）",
                )

        # Rule 3: no clear advantage, keep the current recommendation.
        reason = (
            f"新路径无明显优势（成功率 {new.success_rate:.2f} vs {current.success_rate:.2f}，"
            f"耗时 {new.total_duration:.1f}s vs {current.total_duration:.1f}s），保留现有推荐路径"
        )
        logger.info("Path not updated for '%s': %s", task_type, reason)
        return PathUpdateResult(
            updated=False,
            old_path=current,
            new_path=new,
            reason=reason,
        )

    def _apply_update(
        self,
        task_type: str,
        old: ExecutionPath,
        new: ExecutionPath,
        reason: str,
    ) -> PathUpdateResult:
        """Promote ``new`` to recommended, demote ``old``, and report the change."""
        old.is_recommended = False
        new.is_recommended = True
        self._recommended_paths[task_type] = new
        logger.info("Path updated for '%s': %s", task_type, reason)
        return PathUpdateResult(
            updated=True,
            old_path=old,
            new_path=new,
            reason=reason,
        )

    def get_pending_paths(self, task_type: str) -> list[ExecutionPath]:
        """Return the pending (under-sampled) candidates for a task type.

        Args:
            task_type: Task type to look up.

        Returns:
            A new list with the pending paths; empty when there are none.
        """
        return list(self._pending_paths.get(task_type, []))
--- a/src/agentkit/evolution/pitfall_detector.py
+++ b/src/agentkit/evolution/pitfall_detector.py
@ -0,0 +1,388 @@
 """PitfallDetector - 任务避坑预警
 新任务启动时检索历史失败经验，匹配当前计划步骤，自动预警。
 基于 ExperienceStore 中存储的失败经验，将失败步骤与当前计划步骤
 进行关键词匹配，计算失败率并按严重程度返回预警列表。
 """
 from __future__ import annotations
 import logging
 from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any, Protocol
 logger = logging.getLogger(__name__)
 class WarningLevel(str, Enum):
    """Severity level of a pitfall warning.

    Members also subclass ``str``; their values are the lowercase level names.
    """
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
@dataclass
 class PitfallWarning:
    """A pitfall warning raised for one planned step.

    Attributes:
        step_name: Name of the planned step.
        warning_level: Warning level (HIGH/MEDIUM/LOW).
        failure_rate: Historical failure rate (0.0 ~ 1.0).
        historical_failures: Historical failure reasons.
        suggestion: Optimization suggestion text.
    """
    step_name: str
    warning_level: WarningLevel
    failure_rate: float
    historical_failures: list[str] = field(default_factory=list)
    suggestion: str = ""
 class ExperienceStoreProtocol(Protocol):
    """Structural interface of an experience store, used for type annotations.

    Any object exposing a compatible async ``search`` method satisfies it.
    """
    async def search(
        self,
        query: str,
        top_k: int = 5,
        task_type: str | None = None,
        search_multiplier: int = 5,
    ) -> list[Any]:
        # Protocol stub: implementations return the matching experiences.
        # NOTE(review): the meaning of search_multiplier is defined by the
        # concrete store and is not visible here.
        ...
 # Warning-level thresholds on the historical failure rate:
 # a rate >= _HIGH_THRESHOLD is HIGH, a rate >= _MEDIUM_THRESHOLD is MEDIUM.
 _HIGH_THRESHOLD = 0.5
 _MEDIUM_THRESHOLD = 0.2
 class PitfallDetector:
    """Pitfall detector.

    When a new task starts, searches historical experiences of the same task
    type, matches them against the planned steps and produces warnings.

    Usage:
        detector = PitfallDetector(experience_store)
        warnings = await detector.check_pitfalls(
            task_type="code_review",
            planned_steps=[plan_step1, plan_step2, ...],
        )

    Matching logic:
    1. Search experiences of the same task type.
    2. Aggregate per-step occurrence and failure counts from them.
    3. Match historical steps to planned steps by keyword overlap.
    4. Compute the failure rate and assign a warning level.

    Warning levels:
    - HIGH: failure_rate >= 0.5
    - MEDIUM: failure_rate >= 0.2
    - LOW: at least one recorded failure
    """

    def __init__(
        self,
        experience_store: ExperienceStoreProtocol,
        similarity_threshold: float = 0.3,
        max_search_results: int = 50,
    ):
        """Initialise the detector.

        Args:
            experience_store: Store exposing an async ``search`` method.
            similarity_threshold: Minimum keyword similarity for a planned
                step to be matched with a historical step.
            max_search_results: Maximum number of experiences requested from
                the store.
        """
        self._store = experience_store
        self._similarity_threshold = similarity_threshold
        self._max_search_results = max_search_results

    async def check_pitfalls(
        self,
        task_type: str,
        planned_steps: list[Any],
    ) -> list[PitfallWarning]:
        """Check the planned steps for potential pitfalls.

        Args:
            task_type: Task type used to search historical experiences.
            planned_steps: Planned steps; each is read through its ``name``
                and ``description`` attributes.

        Returns:
            Warnings sorted by severity (HIGH -> MEDIUM -> LOW) and, within
            one level, by descending failure rate. Empty when there are no
            planned steps, no experiences, or no matches.
        """
        if not planned_steps:
            return []

        # 1. Fetch all experiences (successes and failures) so that a
        #    per-step failure rate can be computed.
        all_experiences = await self._search_experiences(task_type)
        if not all_experiences:
            logger.debug("No experiences found for task_type=%s", task_type)
            return []

        # 2. Aggregate step-level statistics.
        step_failure_stats = self._extract_step_failure_stats(all_experiences)

        # 3. Match planned steps and build warnings.
        warnings = self._match_and_warn(planned_steps, step_failure_stats)

        # 4. Most severe first; ties broken by higher failure rate.
        warnings.sort(
            key=lambda w: (_warning_level_order(w.warning_level), -w.failure_rate)
        )

        if warnings:
            logger.info(
                "PitfallDetector found %d warnings for task_type=%s: "
                "%d HIGH, %d MEDIUM, %d LOW",
                len(warnings),
                task_type,
                sum(1 for w in warnings if w.warning_level == WarningLevel.HIGH),
                sum(1 for w in warnings if w.warning_level == WarningLevel.MEDIUM),
                sum(1 for w in warnings if w.warning_level == WarningLevel.LOW),
            )
        return warnings

    async def _search_experiences(self, task_type: str) -> list[Any]:
        """Search the store for experiences of ``task_type``.

        Detection is best-effort: any exception raised by the store is
        logged and turned into an empty result.
        """
        try:
            return await self._store.search(
                query=task_type,
                top_k=self._max_search_results,
                task_type=task_type,
            )
        except Exception as e:
            logger.error(
                "Failed to search experiences for pitfall detection: %s", e
            )
            return []

    def _extract_step_failure_stats(
        self, failed_experiences: list[Any]
    ) -> dict[str, _StepFailureStats]:
        """Aggregate step-level failure statistics from experiences.

        Each experience's ``steps_summary`` is read. Only list values are
        used; every dict item contributes through its ``step_name``,
        ``outcome`` and ``error`` keys. Any other ``steps_summary`` value
        (for example a plain string) carries no step-level data and the
        experience is skipped. Outcomes "failure", "failed" and "error" count
        as failures.

        An experience's ``optimization_tips`` are attached only to the steps
        that appear in that same experience.

        Args:
            failed_experiences: Experiences to aggregate (despite the name,
                the caller passes successes and failures alike).

        Returns:
            Statistics keyed by step name.
        """
        stats: dict[str, _StepFailureStats] = {}
        for exp in failed_experiences:
            steps_summary = exp.steps_summary
            if not isinstance(steps_summary, list):
                continue

            # Steps that occur in this experience, keyed by name so that a
            # repeated step receives the experience's tips only once.
            touched: dict[str, _StepFailureStats] = {}
            for step in steps_summary:
                if not isinstance(step, dict):
                    continue
                step_name = step.get("step_name", "")
                if not step_name:
                    continue

                entry = stats.get(step_name)
                if entry is None:
                    entry = _StepFailureStats(
                        step_name=step_name,
                        total_occurrences=0,
                        failure_occurrences=0,
                        failure_reasons=[],
                        optimization_tips=[],
                    )
                    stats[step_name] = entry
                touched[step_name] = entry

                entry.total_occurrences += 1
                if step.get("outcome", "") in ("failure", "failed", "error"):
                    entry.failure_occurrences += 1
                    error = step.get("error", "")
                    if error:
                        entry.failure_reasons.append(error)

            # Tips belong to this experience, so attach them only to its own
            # steps rather than to every step seen so far.
            tips = getattr(exp, "optimization_tips", None)
            if tips:
                for entry in touched.values():
                    entry.optimization_tips.extend(tips)
        return stats

    def _match_and_warn(
        self,
        planned_steps: list[Any],
        step_failure_stats: dict[str, _StepFailureStats],
    ) -> list[PitfallWarning]:
        """Match planned steps against the statistics and build warnings.

        For every planned step the historical step with the highest keyword
        similarity is selected. No warning is produced when the similarity is
        below the configured threshold or when the matched step has no
        recorded failure.
        """
        warnings: list[PitfallWarning] = []
        for step in planned_steps:
            step_name = getattr(step, "name", "")
            step_description = getattr(step, "description", "")
            if not step_name:
                continue

            # Pick the historical step that resembles this one the most.
            best_match: _StepFailureStats | None = None
            best_similarity = 0.0
            for stats_step_name, stats in step_failure_stats.items():
                similarity = _compute_name_similarity(
                    step_name, step_description, stats_step_name
                )
                if similarity > best_similarity:
                    best_similarity = similarity
                    best_match = stats

            if best_match is None or best_similarity < self._similarity_threshold:
                continue
            # A step that never failed gives nothing to warn about; even the
            # LOW level requires at least one failure record.
            if best_match.failure_occurrences == 0:
                continue

            failure_rate = (
                best_match.failure_occurrences / best_match.total_occurrences
                if best_match.total_occurrences > 0
                else 0.0
            )
            warnings.append(
                PitfallWarning(
                    step_name=step_name,
                    warning_level=_determine_warning_level(failure_rate),
                    failure_rate=round(failure_rate, 4),
                    # Keep at most five historical reasons.
                    historical_failures=best_match.failure_reasons[:5],
                    suggestion=_build_suggestion(best_match, failure_rate),
                )
            )
        return warnings
 # ── 内部辅助类 ──────────────────────────────────────────────
@dataclass
 class _StepFailureStats:
    """Step-level failure statistics (internal use).

    Attributes:
        step_name: Name of the historical step.
        total_occurrences: How many times the step was seen.
        failure_occurrences: How many of those occurrences failed.
        failure_reasons: Collected error texts of the failed occurrences.
        optimization_tips: Collected optimization tips.
    """
    step_name: str
    total_occurrences: int
    failure_occurrences: int
    failure_reasons: list[str]
    optimization_tips: list[str]
 # ── 辅助函数 ──────────────────────────────────────────────
 def _compute_name_similarity(
    step_name: str, step_description: str, historical_step_name: str
 ) -> float:
    """Score the keyword overlap between a planned and a historical step.

    The planned step contributes the keywords of its name and description
    together; the historical step contributes the keywords of its name. The
    score is the Jaccard similarity of the two keyword sets.

    Args:
        step_name: Name of the planned step.
        step_description: Description of the planned step.
        historical_step_name: Name of the historical step.

    Returns:
        Similarity in the range 0.0 ~ 1.0; 0.0 when either side yields no
        keywords.
    """
    planned = _extract_keywords(f"{step_name} {step_description}")
    historical = _extract_keywords(historical_step_name)
    # Both sets must be non-empty, which also keeps the union non-empty.
    if not (planned and historical):
        return 0.0
    shared_count = len(planned & historical)
    combined_count = len(planned | historical)
    return shared_count / combined_count
 _STOP_WORDS = frozenset({
    "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for",
    "of", "with", "by", "from", "is", "are", "was", "were", "be", "been",
    "being", "have", "has", "had", "do", "does", "did", "will", "would",
    "could", "should", "may", "might", "can", "shall", "not", "no",
 })
 def _extract_keywords(text: str) -> frozenset[str]:
    """从文本中提取关键词集合
    转小写、按空白/下划线/连字符拆分、过滤停用词和单字符词。
    """
    # 统一分隔符
    normalized = text.lower().replace("_", " ").replace("-", " ")
    words = normalized.split()
    return frozenset(
        w for w in words
        if len(w) > 1 and w not in _STOP_WORDS
    )
 def _determine_warning_level(failure_rate: float) -> WarningLevel:
    """Map a failure rate to a warning level.

    - HIGH: failure_rate >= _HIGH_THRESHOLD
    - MEDIUM: failure_rate >= _MEDIUM_THRESHOLD
    - LOW: everything below
    """
    # Checked from the most severe floor downwards; the first hit wins.
    floors = (
        (_HIGH_THRESHOLD, WarningLevel.HIGH),
        (_MEDIUM_THRESHOLD, WarningLevel.MEDIUM),
    )
    for floor, level in floors:
        if failure_rate >= floor:
            return level
    return WarningLevel.LOW
 def _warning_level_order(level: WarningLevel) -> int:
    """Return the sort rank of a warning level (smaller means more severe)."""
    by_severity = (WarningLevel.HIGH, WarningLevel.MEDIUM, WarningLevel.LOW)
    ranks = {member: position for position, member in enumerate(by_severity)}
    return ranks[level]
 def _build_suggestion(stats: _StepFailureStats, failure_rate: float) -> str:
    """Compose the suggestion text for a matched step.

    The text starts with a sentence chosen by failure rate, followed by up to
    three distinct failure reasons and up to two distinct optimization tips
    when available. Sentences are joined with "。".
    """
    if failure_rate >= _HIGH_THRESHOLD:
        headline = f"该步骤历史失败率高达 {failure_rate:.0%}，建议重点关注"
    elif failure_rate >= _MEDIUM_THRESHOLD:
        headline = f"该步骤历史失败率为 {failure_rate:.0%}，需注意风险"
    else:
        headline = f"该步骤有少量失败记录（失败率 {failure_rate:.0%}）"
    sentences: list[str] = [headline]

    if stats.failure_reasons:
        # dict.fromkeys de-duplicates while keeping first-seen order.
        top_reasons = list(dict.fromkeys(stats.failure_reasons))[:3]
        sentences.append("常见失败原因：" + "、".join(top_reasons))

    if stats.optimization_tips:
        top_tips = list(dict.fromkeys(stats.optimization_tips))[:2]
        sentences.append("建议：" + "；".join(top_tips))

    return "。".join(sentences)
--- a/src/agentkit/tools/init.py
+++ b/src/agentkit/tools/init.py
@ -9,6 +9,10 @@ from agentkit.tools.composition import SequentialChain, ParallelFanOut, DynamicS
 from agentkit.tools.web_crawl import WebCrawlTool
 from agentkit.tools.schema_tools import SchemaExtractTool, SchemaGenerateTool
 from agentkit.tools.baidu_search import BaiduSearchTool
 from agentkit.tools.shell import ShellTool
 from agentkit.tools.terminal_session import TerminalSession, TerminalSessionManager
 from agentkit.tools.pty_session import PTYSession
 from agentkit.tools.output_parser import OutputParser, ParsedOutput, ErrorType
 # Conditional import: HeadroomRetrieveTool requires HeadroomCompressor
 try:
@ -30,4 +34,11 @@ __all__ = [
    "SchemaGenerateTool",
    "BaiduSearchTool",
    "HeadroomRetrieveTool",
    "ShellTool",
    "TerminalSession",
    "TerminalSessionManager",
    "PTYSession",
    "OutputParser",
    "ParsedOutput",
    "ErrorType",
 ]
--- a/src/agentkit/tools/output_parser.py
+++ b/src/agentkit/tools/output_parser.py
@ -0,0 +1,294 @@
 """OutputParser - 结构化解析命令输出
 将命令行输出解析为结构化格式，包含错误类型识别、退出码含义和可操作建议。
 """
 from __future__ import annotations
 import re
 from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any
 class ErrorType(Enum):
    """Error categories assigned to command output."""
    NONE = "none"
    PERMISSION_DENIED = "permission_denied"
    NOT_FOUND = "not_found"
    TIMEOUT = "timeout"
    SYNTAX_ERROR = "syntax_error"
    CONNECTION_REFUSED = "connection_refused"
    OUT_OF_MEMORY = "out_of_memory"
    DISK_FULL = "disk_full"
    ALREADY_EXISTS = "already_exists"
    INVALID_ARGUMENT = "invalid_argument"
    PROCESS_NOT_FOUND = "process_not_found"
    NETWORK_ERROR = "network_error"
    UNKNOWN = "unknown"
@dataclass
class ParsedOutput:
    """Structured result of parsing a command's output.

    Attributes:
        exit_code: Exit code of the command.
        is_error: Whether the output represents an error.
        error_type: Error category; ErrorType.NONE by default.
        message: Short summary message.
        raw_output: Original output text.
        suggestions: Actionable suggestions.
    """
    exit_code: int
    is_error: bool
    error_type: ErrorType = ErrorType.NONE
    message: str = ""
    raw_output: str = ""
    suggestions: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return a dict view of the result.

        ``error_type`` is exported as its enum value; ``raw_output`` is not
        included.
        """
        payload: dict[str, Any] = {}
        payload["exit_code"] = self.exit_code
        payload["is_error"] = self.is_error
        payload["error_type"] = self.error_type.value
        payload["message"] = self.message
        payload["suggestions"] = self.suggestions
        return payload
 # Error classification rules: (pattern, error_type, message_template, suggestions).
 # Rules are tried in order and the first matching pattern wins, so a more
 # specific rule must be listed before a more general one that would also match.
 _ERROR_PATTERNS: list[tuple[re.Pattern, ErrorType, str, list[str]]] = [
    (
        re.compile(r"permission denied|access denied|权限不足|拒绝访问", re.IGNORECASE),
        ErrorType.PERMISSION_DENIED,
        "权限不足",
        [
            "尝试使用 sudo 执行该命令",
            "检查文件/目录权限: ls -la <path>",
            "确认当前用户是否有所需权限",
        ],
    ),
    # Listed before the generic NOT_FOUND rule: "process not found" and
    # "进程不存在" also match "not found" / "不存在" and would otherwise never
    # be classified as PROCESS_NOT_FOUND.
    (
        re.compile(
            r"no such process|process not found|进程不存在|进程未找到",
            re.IGNORECASE,
        ),
        ErrorType.PROCESS_NOT_FOUND,
        "进程不存在",
        [
            "确认进程 ID 是否正确",
            "使用 ps aux 查看运行中的进程",
            "进程可能已经结束",
        ],
    ),
    (
        re.compile(
            r"not found|no such file|no such directory|找不到|不存在|无法找到",
            re.IGNORECASE,
        ),
        ErrorType.NOT_FOUND,
        "文件或目录不存在",
        [
            "检查路径拼写是否正确",
            "使用 ls 确认文件/目录是否存在",
            "检查是否在正确的工作目录下",
        ],
    ),
    (
        re.compile(r"timed?\s*out|timeout|超时|时间超限", re.IGNORECASE),
        ErrorType.TIMEOUT,
        "命令执行超时",
        [
            "增加超时时间",
            "检查网络连接是否正常",
            "检查目标服务是否可达",
        ],
    ),
    (
        re.compile(
            r"syntax error|syntaxerror|parse error|语法错误|解析错误",
            re.IGNORECASE,
        ),
        ErrorType.SYNTAX_ERROR,
        "语法错误",
        [
            "检查命令语法是否正确",
            "使用 --help 查看命令用法",
            "检查引号和特殊字符是否正确转义",
        ],
    ),
    (
        re.compile(
            r"connection refused|连接被拒绝|无法连接|ECONNREFUSED",
            re.IGNORECASE,
        ),
        ErrorType.CONNECTION_REFUSED,
        "连接被拒绝",
        [
            "检查目标服务是否已启动",
            "确认端口号是否正确",
            "检查防火墙设置是否阻止了连接",
        ],
    ),
    (
        # "oom" must start at a word boundary and be followed by "kill" or by
        # a non-letter; a bare substring match would also hit words such as
        # "room" or "zoom".
        re.compile(
            r"out of memory|\boom(?:kill|(?![a-z]))|cannot allocate|内存不足|内存溢出",
            re.IGNORECASE,
        ),
        ErrorType.OUT_OF_MEMORY,
        "内存不足",
        [
            "释放不必要的内存占用",
            "增加系统可用内存",
            "检查是否有内存泄漏",
        ],
    ),
    (
        re.compile(
            r"no space left|disk full|磁盘已满|空间不足|ENOSPC",
            re.IGNORECASE,
        ),
        ErrorType.DISK_FULL,
        "磁盘空间不足",
        [
            "清理不必要的文件: du -sh * | sort -rh | head",
            "检查磁盘使用情况: df -h",
            "删除临时文件或日志",
        ],
    ),
    (
        re.compile(
            r"already exists|file exists|已存在|重复|EEXIST",
            re.IGNORECASE,
        ),
        ErrorType.ALREADY_EXISTS,
        "资源已存在",
        [
            "使用 -f 参数强制覆盖（如适用）",
            "先删除已有资源再重新创建",
            "使用不同名称创建",
        ],
    ),
    (
        re.compile(
            r"invalid argument|illegal option|bad option|无效参数|非法选项|invalid option",
            re.IGNORECASE,
        ),
        ErrorType.INVALID_ARGUMENT,
        "无效参数",
        [
            "检查命令参数是否正确",
            "使用 --help 查看支持的参数",
            "确认参数值类型和范围",
        ],
    ),
    (
        re.compile(
            r"network is unreachable|no route to host|name resolution|网络不可达|无法解析|ENETUNREACH",
            re.IGNORECASE,
        ),
        ErrorType.NETWORK_ERROR,
        "网络错误",
        [
            "检查网络连接是否正常",
            "确认 DNS 解析是否正常: nslookup <domain>",
            "检查代理设置",
        ],
    ),
 ]
 class OutputParser:
    """Structured parser for command output.

    Turns a command's combined output (stdout + stderr) and its exit code
    into a ParsedOutput carrying the error category, a summary message and
    actionable suggestions.
    """

    def parse(self, output: str, exit_code: int) -> ParsedOutput:
        """Parse command output.

        A non-zero exit code marks the result as an error; only then is the
        output classified and given suggestions.

        Args:
            output: Combined stdout and stderr text of the command.
            exit_code: Exit code of the command.

        Returns:
            The structured parse result.
        """
        is_error = exit_code != 0
        message = self._extract_message(output)
        error_type = ErrorType.NONE
        suggestions: list[str] = []
        if is_error:
            error_type, suggestions = self._classify_error(output, exit_code)
        return ParsedOutput(
            exit_code=exit_code,
            is_error=is_error,
            error_type=error_type,
            message=message,
            raw_output=output,
            suggestions=suggestions,
        )

    def _extract_message(self, output: str) -> str:
        """Extract a summary message from the output.

        The summary is the last non-blank line, stripped of surrounding
        whitespace and truncated to 200 characters plus "..." when longer.
        Empty or whitespace-only output yields "".
        """
        if not output:
            return ""
        lines = [line.strip() for line in output.strip().splitlines() if line.strip()]
        if not lines:
            return ""
        message = lines[-1]
        if len(message) > 200:
            message = message[:200] + "..."
        return message

    def _classify_error(
        self, output: str, exit_code: int
    ) -> tuple[ErrorType, list[str]]:
        """Classify an error from the output text and the exit code.

        The output is matched against _ERROR_PATTERNS first (first match
        wins); exit codes 126, 127 and 130 serve as fallbacks, and everything
        else is UNKNOWN.

        Args:
            output: Command output.
            exit_code: Exit code of the command.

        Returns:
            An ``(error_type, suggestions)`` tuple. The suggestions list is
            always a fresh list owned by the caller.
        """
        # Content-based classification takes priority.
        for pattern, error_type, _msg, suggestions in _ERROR_PATTERNS:
            if pattern.search(output):
                # Copy: the rule table is module-level shared state and must
                # not be reachable for mutation through a ParsedOutput.
                return error_type, list(suggestions)

        # Exit-code fallbacks.
        if exit_code == 126:
            return ErrorType.PERMISSION_DENIED, [
                "检查文件是否有执行权限: chmod +x <file>",
                "确认文件格式是否正确（如行尾符）",
            ]
        if exit_code == 127:
            return ErrorType.NOT_FOUND, [
                "检查命令是否已安装",
                "确认命令名称拼写是否正确",
                "检查 PATH 环境变量是否包含命令所在目录",
            ]
        if exit_code == 130:
            return ErrorType.TIMEOUT, [
                "命令被 Ctrl+C 中断",
                "可能需要增加超时时间",
            ]
        return ErrorType.UNKNOWN, [
            "检查命令输出中的错误信息",
            "使用 --verbose 或 --debug 获取更多详情",
        ]
--- a/src/agentkit/tools/pty_session.py
+++ b/src/agentkit/tools/pty_session.py
@ -0,0 +1,341 @@
 """PTYSession - 伪终端会话，支持交互式命令
 基于 asyncio + os.openpty() 实现伪终端，支持交互式命令和自动应答。
 不依赖 pexpect，仅使用标准库。
 """
 from __future__ import annotations
 import asyncio
 import fcntl
 import logging
 import os
 import struct
 import termios
 import time
 from dataclasses import dataclass
 logger = logging.getLogger(__name__)
 # Auto-respond rules: (prompt_pattern, response).
 # Each pattern is a regular-expression source string anchored at the end of
 # the text (trailing whitespace allowed).
 # NOTE(review): how an empty response is handled depends on the consuming
 # code, which is outside this view; confirm it is treated as "do not answer".
 _AUTO_RESPOND_RULES: list[tuple[str, str]] = [
    (r"\[y/N\]\s*$", "y"),
    (r"\[Y/n\]\s*$", "y"),
    (r"\[yes/no\]\s*$", "yes"),
    (r"\(yes/no\)\s*$", "yes"),
    (r"\(yes/no/\[fingerprint\]\)\s*$", "yes"),
    (r"continue\?\s*$", "y"),
    (r"are you sure\?\s*$", "y"),
    (r"password:\s*$", ""),  # password prompts are not auto-answered; a human must step in
    (r"passphrase:\s*$", ""),
 ]
@dataclass
 class PTYOutput:
    """Result of running a command in a PTY.

    Attributes:
        output: Captured output text.
        exit_code: Exit code (-1 means the command timed out or has not finished).
        timed_out: Whether the command timed out.
    """
    output: str
    exit_code: int = -1
    timed_out: bool = False
 class PTYSession:
    """伪终端会话 - 支持交互式命令
    使用 os.openpty() 创建伪终端对，通过 asyncio 异步读写。
    支持自动检测提示并应答（如 yes/no 确认）。
    Usage:
        pty = PTYSession()
        await pty.start()
        output = await pty.run_command("ssh-keygen", timeout=10)
        await pty.close()
    """
    def __init__(
        self,
        auto_respond: bool = True,
        custom_rules: list[tuple[str, str]] | None = None,
        default_timeout: float = 30.0,
        buffer_size: int = 4096,
    ):
        """Set up a PTY session; no pseudo-terminal is created until ``start()``.

        Args:
            auto_respond: Whether known prompts are answered automatically.
            custom_rules: Extra ``(prompt_pattern, response)`` rules, checked
                after the built-in ones.
            default_timeout: Timeout in seconds used when a call gives none.
            buffer_size: Number of bytes requested per read.
        """
        # Behaviour settings
        self._auto_respond = auto_respond
        self._default_timeout = default_timeout
        self._buffer_size = buffer_size
        # Built-in rules come first, caller-supplied rules are appended.
        self._respond_rules = [*_AUTO_RESPOND_RULES, *(custom_rules or ())]
        # Runtime state, populated by start() / run_command()
        self._master_fd: int | None = None
        self._slave_fd: int | None = None
        self._process: asyncio.subprocess.Process | None = None
        self._running = False
        self._output_buffer = ""
    @property
    def is_running(self) -> bool:
        """Whether the PTY session has been started (pseudo-terminal created)."""
        return self._running
    async def start(self) -> None:
        """Start the session by creating the master/slave pseudo-terminal pair.

        Must run before a command is executed. Calling it on an already
        started session is a no-op.
        """
        if self._running:
            return
        master, slave = os.openpty()
        self._master_fd, self._slave_fd = master, slave
        # Terminal size: 24 rows x 80 columns. Failure here is not fatal.
        try:
            fcntl.ioctl(slave, termios.TIOCSWINSZ, struct.pack("HHHH", 24, 80, 0, 0))
        except Exception:
            pass
        # Reads from the master side must never block the event loop.
        current_flags = fcntl.fcntl(master, fcntl.F_GETFL)
        fcntl.fcntl(master, fcntl.F_SETFL, current_flags | os.O_NONBLOCK)
        self._running = True
        logger.debug("PTY session started: master=%d slave=%d", self._master_fd, self._slave_fd)
    async def run_command(
        self,
        command: str,
        timeout: float | None = None,
        cwd: str | None = None,
        env: dict[str, str] | None = None,
    ) -> PTYOutput:
        """Run a command inside the PTY and wait for it to finish.

        Args:
            command: Shell command to execute.
            timeout: Timeout in seconds; the session default is used when
                omitted (or falsy).
            cwd: Working directory for the child process.
            env: Extra environment variables layered over ``os.environ``.

        Returns:
            A ``PTYOutput``. On timeout ``timed_out`` is True and ``exit_code``
            is -1; the child process is left running.
        """
        if self._running and self._slave_fd is None:
            # A previous command consumed the slave end (it is closed right
            # after spawning). Without a fresh pair the child would be started
            # with stdin/stdout/stderr=None and silently inherit the parent's
            # stdio instead of the pseudo-terminal, so rebuild the session.
            await self.close()
        if not self._running:
            await self.start()
        timeout = timeout or self._default_timeout
        self._output_buffer = ""
        # Build the child's environment
        cmd_env = dict(os.environ)
        if env:
            cmd_env.update(env)
        # Spawn the child with the slave end as stdin/stdout/stderr
        self._process = await asyncio.create_subprocess_shell(
            command,
            stdin=self._slave_fd,
            stdout=self._slave_fd,
            stderr=self._slave_fd,
            cwd=cwd,
            env=cmd_env,
            start_new_session=True,
        )
        # Close our copy of the slave end (the child has inherited it)
        os.close(self._slave_fd)
        self._slave_fd = None
        # Read output asynchronously until the process exits
        try:
            exit_code = await self._read_until_exit(timeout)
        except asyncio.TimeoutError:
            self._output_buffer += "\n[PTY 命令执行超时]"
            return PTYOutput(
                output=self._output_buffer,
                exit_code=-1,
                timed_out=True,
            )
        return PTYOutput(
            output=self._output_buffer,
            exit_code=exit_code,
            timed_out=False,
        )
    async def send(self, line: str) -> None:
        """向 PTY 发送一行输入
        Args:
            line: 要发送的文本（自动追加换行符）
        """
        if self._master_fd is None:
            return
        data = (line + "\n").encode("utf-8")
        try:
            os.write(self._master_fd, data)
        except OSError as e:
            logger.warning("PTY write failed: %s", e)
    async def read_output(self, timeout: float = 1.0) -> str:
        """读取当前可用的 PTY 输出
        Args:
            timeout: 读取超时（秒）
        Returns:
            读取到的输出文本
        """
        if self._master_fd is None:
            return ""
        output = ""
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                chunk = os.read(self._master_fd, self._buffer_size)
                if chunk:
                    text = chunk.decode("utf-8", errors="replace")
                    output += text
                    self._output_buffer += text
                    # 自动应答
                    if self._auto_respond:
                        await self._try_auto_respond(text)
            except (BlockingIOError, OSError):
                # 没有数据可读
                await asyncio.sleep(0.05)
                continue
            if output:
                # 读取到数据后短暂等待看是否还有更多
                await asyncio.sleep(0.05)
        return output
    async def close(self) -> None:
        """Close the session: stop a still-running child and release both fds.

        The child is sent ``terminate()`` and given 5 seconds to exit; if it
        does not, it is killed and then awaited so it is actually reaped.
        """
        self._running = False
        proc = self._process
        if proc is not None and proc.returncode is None:
            try:
                proc.terminate()
                await asyncio.wait_for(proc.wait(), timeout=5.0)
            except ProcessLookupError:
                # The process was already gone.
                pass
            except asyncio.TimeoutError:
                try:
                    proc.kill()
                    # Previously the killed process was never awaited, which
                    # left it unreaped. Bound the wait so close() cannot hang.
                    await asyncio.wait_for(proc.wait(), timeout=5.0)
                except ProcessLookupError:
                    pass
                except asyncio.TimeoutError:
                    logger.warning("PTY child did not exit after kill()")
        # Release whichever ends of the pseudo-terminal are still open.
        for attr in ("_master_fd", "_slave_fd"):
            fd = getattr(self, attr)
            if fd is None:
                continue
            try:
                os.close(fd)
            except OSError:
                pass
            setattr(self, attr, None)
        self._process = None
        logger.debug("PTY session closed")
    async def _read_until_exit(self, timeout: float) -> int:
        """Keep reading PTY output until the child process exits.

        Args:
            timeout: Maximum time to wait, in seconds.

        Returns:
            The process exit code.

        Raises:
            asyncio.TimeoutError: If the deadline passes before the process exits.
        """
        deadline = time.monotonic() + timeout
        while True:
            # Enforce the deadline
            if time.monotonic() > deadline:
                raise asyncio.TimeoutError()
            # Check whether the process has already exited
            if self._process.returncode is not None:
                # Process exited: read whatever output is still pending
                await self._drain_remaining_output()
                return self._process.returncode
            # Read output
            try:
                chunk = os.read(self._master_fd, self._buffer_size)
                if chunk:
                    text = chunk.decode("utf-8", errors="replace")
                    self._output_buffer += text
                    # Auto-respond to known prompts
                    if self._auto_respond:
                        await self._try_auto_respond(text)
            except (BlockingIOError, OSError):
                # No data available (or the read failed); keep polling
                pass
            # Check process state, waiting at most 50 ms for it to exit
            if self._process.returncode is None:
                try:
                    await asyncio.wait_for(
                        self._process.wait(), timeout=0.05
                    )
                except asyncio.TimeoutError:
                    pass
            else:
                await self._drain_remaining_output()
                return self._process.returncode
            await asyncio.sleep(0.02)
    async def _drain_remaining_output(self) -> None:
        """排空剩余输出"""
        for _ in range(10):  # 最多尝试 10 次
            try:
                chunk = os.read(self._master_fd, self._buffer_size)
                if chunk:
                    text = chunk.decode("utf-8", errors="replace")
                    self._output_buffer += text
                else:
                    break
            except (BlockingIOError, OSError):
                break
            await asyncio.sleep(0.01)
    async def _try_auto_respond(self, recent_output: str) -> None:
        """检测提示并自动应答
        Args:
            recent_output: 最近的输出文本
        """
        import re
        for pattern, response in self._respond_rules:
            if not response:
                # 空响应规则（如密码提示）跳过
                continue
            if re.search(pattern, recent_output, re.IGNORECASE | re.MULTILINE):
                logger.debug("Auto-responding to prompt '%s' with '%s'", pattern, response)
                await self.send(response)
                break
--- a/src/agentkit/tools/shell.py
+++ b/src/agentkit/tools/shell.py
@ -0,0 +1,432 @@
 """ShellTool - Shell 命令执行工具
 支持无会话模式（向后兼容）和有会话模式（跨命令保持状态）。
 危险命令通过确认回调请求人工确认，所有操作记录审计日志。
 """
 from __future__ import annotations
 import asyncio
 import logging
 import os
 import time
 from typing import Any, Callable, Awaitable
 from agentkit.tools.base import Tool
 from agentkit.tools.output_parser import OutputParser, ParsedOutput
 from agentkit.tools.terminal_session import TerminalSession, TerminalSessionManager
 from agentkit.tools.pty_session import PTYSession
 logger = logging.getLogger(__name__)
 # Safe allowlist: commands starting with one of these prefixes need no confirmation.
 # ShellTool._is_dangerous consults this list before the dangerous-pattern check.
 # NOTE(review): curl, wget, env and find can write files or launch other
 # programs - confirm they really belong in an allowlist.
 _SAFE_COMMAND_PREFIXES: tuple[str, ...] = (
    "ls",
    "cat",
    "head",
    "tail",
    "grep",
    "find",
    "pwd",
    "echo",
    "which",
    "whoami",
    "id",
    "date",
    "uname",
    "df",
    "du",
    "free",
    "ps",
    "top",
    "env",
    "printenv",
    "type",
    "file",
    "stat",
    "wc",
    "sort",
    "uniq",
    "diff",
    "git status",
    "git log",
    "git diff",
    "git branch",
    "git remote",
    "pip list",
    "pip show",
    "python --version",
    "python3 --version",
    "node --version",
    "npm list",
    "docker ps",
    "docker images",
    "curl",
    "wget",
 )
 # Dangerous command patterns: commands containing any of these need human confirmation.
 # ShellTool._is_dangerous matches them as substrings of the lower-cased command,
 # so short entries such as "format" or "truncate" also match inside longer words.
 _DANGEROUS_PATTERNS: tuple[str, ...] = (
    "rm ",
    "rm -",
    "rmdir",
    "mkfs",
    "dd ",
    "format",
    "del ",
    "erase",
    "> /dev/",
    "shutdown",
    "reboot",
    "init 0",
    "init 6",
    "kill -9",
    "killall",
    "chmod 777",
    "chown",
    "mv /",
    "pip uninstall",
    "npm uninstall",
    "apt remove",
    "yum remove",
    "brew uninstall",
    "docker rm",
    "docker rmi",
    "git push --force",
    "git reset --hard",
    "git clean -f",
    "drop table",
    "drop database",
    "truncate",
 )
 class ShellTool(Tool):
    """Shell 命令执行工具
    支持两种模式：
    1. 无会话模式（默认）：每次命令独立执行，不保持状态
    2. 有会话模式：通过 session_id 指定会话，跨命令保持 cwd/env/history
    安全控制：
    - 危险命令通过 confirm_callback 请求人工确认
    - 所有操作记录审计日志
    Usage:
        # 无会话模式
        tool = ShellTool()
        result = await tool.execute(command="ls -la")
        # 有会话模式
        result = await tool.execute(command="cd /tmp", session_id="build-01")
        result = await tool.execute(command="pwd", session_id="build-01")  # 输出 /tmp
    """
    def __init__(
        self,
        name: str = "shell",
        description: str = "执行 Shell 命令，支持会话模式保持跨命令状态",
        input_schema: dict[str, Any] | None = None,
        output_schema: dict[str, Any] | None = None,
        version: str = "1.0.0",
        tags: list[str] | None = None,
        confirm_callback: Callable[[str], Awaitable[bool]] | None = None,
        default_timeout: float = 60.0,
        max_output_length: int = 50000,
    ):
        """Create the shell tool.

        Args:
            name: Tool name passed to the base ``Tool``.
            description: Tool description passed to the base ``Tool``.
            input_schema: Input schema; the built-in default is used when falsy.
            output_schema: Output schema; the built-in default is used when falsy.
            version: Tool version string.
            tags: Tool tags; defaults to ``["shell", "terminal", "system"]``.
            confirm_callback: Awaited with the command text when a command is
                judged dangerous; without it such commands are refused.
            default_timeout: Timeout in seconds used when ``execute`` gets none.
            max_output_length: Output longer than this is truncated by ``execute``.
        """
        super().__init__(
            name=name,
            description=description,
            input_schema=input_schema or self._default_input_schema(),
            output_schema=output_schema or self._default_output_schema(),
            version=version,
            tags=tags or ["shell", "terminal", "system"],
        )
        self._session_manager = TerminalSessionManager()
        self._output_parser = OutputParser()
        self._confirm_callback = confirm_callback
        self._default_timeout = default_timeout
        self._max_output_length = max_output_length
        # In-memory audit trail; entries are appended by _log_audit
        self._audit_log: list[dict[str, Any]] = []
    @staticmethod
    def _default_input_schema() -> dict[str, Any]:
        """Return the default JSON-Schema-style description of ``execute`` inputs.

        Only ``command`` is required.
        """
        return {
            "type": "object",
            "properties": {
                "command": {
                    "type": "string",
                    "description": "要执行的 Shell 命令",
                },
                "timeout": {
                    "type": "number",
                    "description": "超时时间（秒），默认 60",
                    "default": 60,
                },
                "working_dir": {
                    "type": "string",
                    "description": "工作目录（仅无会话模式有效）",
                },
                "session_id": {
                    "type": "string",
                    "description": "会话 ID，指定后在会话中执行命令，跨命令保持状态",
                },
                "interactive": {
                    "type": "boolean",
                    "description": "是否使用交互式模式（PTY），用于需要用户输入的命令",
                    "default": False,
                },
            },
            "required": ["command"],
        }
    @staticmethod
    def _default_output_schema() -> dict[str, Any]:
        """Return the default JSON-Schema-style description of the ``execute`` result dict."""
        return {
            "type": "object",
            "properties": {
                "output": {"type": "string", "description": "命令输出"},
                "exit_code": {"type": "integer", "description": "退出码"},
                "is_error": {"type": "boolean", "description": "是否为错误"},
                "error_type": {"type": "string", "description": "错误类型"},
                "message": {"type": "string", "description": "消息摘要"},
                "suggestions": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "可操作建议",
                },
                "session_id": {"type": "string", "description": "会话 ID（仅会话模式）"},
            },
        }
    async def execute(self, **kwargs) -> dict:
        """Execute a shell command and return a structured result.

        Keyword Args:
            command: Command to run (required).
            timeout: Timeout in seconds.
            working_dir: Working directory (standalone mode; in session mode
                it is only used when the session is first created).
            session_id: Session ID; enables session mode when given.
            interactive: Whether to run the command through a PTY.

        Returns:
            A dict with ``output``, ``exit_code``, ``is_error``, ``error_type``,
            ``message`` and ``suggestions``; executed commands also carry
            ``session_id``.
        """
        command = kwargs.get("command")
        if not command:
            return {
                "output": "",
                "exit_code": 1,
                "is_error": True,
                "error_type": "invalid_argument",
                "message": "command 参数是必需的",
                "suggestions": ["提供要执行的 Shell 命令"],
            }
        session_id = kwargs.get("session_id")
        working_dir = kwargs.get("working_dir")
        interactive = kwargs.get("interactive", False)
        timeout = kwargs.get("timeout", self._default_timeout)
        # Safety gate: a dangerous command runs only after explicit confirmation.
        if self._is_dangerous(command) and not await self._request_confirmation(command):
            self._log_audit(command, None, blocked=True)
            return {
                "output": "",
                "exit_code": 126,
                "is_error": True,
                "error_type": "permission_denied",
                "message": f"危险命令已被拒绝执行: {command[:100]}",
                "suggestions": [
                    "如需执行此命令，请手动确认",
                    "考虑使用更安全的替代命令",
                ],
            }
        # Dispatch on execution mode
        if session_id:
            parsed = await self._execute_in_session(
                command, session_id, timeout, working_dir, interactive
            )
        else:
            parsed = await self._execute_standalone(command, timeout, working_dir, interactive)
        # Audit trail
        self._log_audit(command, session_id, exit_code=parsed.exit_code)
        # Truncate overly long output
        raw = parsed.raw_output
        limit = self._max_output_length
        shown = raw if len(raw) <= limit else raw[:limit] + "\n... [输出已截断]"
        return {
            "output": shown,
            "exit_code": parsed.exit_code,
            "is_error": parsed.is_error,
            "error_type": parsed.error_type.value,
            "message": parsed.message,
            "suggestions": parsed.suggestions,
            "session_id": session_id,
        }
    async def _execute_standalone(
        self,
        command: str,
        timeout: float,
        working_dir: str | None,
        interactive: bool,
    ) -> ParsedOutput:
        """Run a single command with no session state (backward-compatible mode).

        Args:
            command: Shell command to run.
            timeout: Timeout in seconds.
            working_dir: Working directory for the command, if any.
            interactive: When true the command is run through a PTY instead.

        Returns:
            The parsed output. A timeout or any spawn failure is reported with
            exit code -1 and the error text as output.
        """
        if interactive:
            return await self._execute_with_pty(command, timeout, working_dir)
        # (The unused ``start`` timestamp from the earlier version is gone.)
        try:
            proc = await asyncio.create_subprocess_shell(
                command,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.STDOUT,
                cwd=working_dir,
            )
            try:
                stdout, _ = await asyncio.wait_for(
                    proc.communicate(),
                    timeout=timeout,
                )
            except asyncio.TimeoutError:
                # Kill the shell and reap it so no child is left unreaped.
                proc.kill()
                await proc.wait()
                output = f"命令执行超时（{timeout}s）"
                exit_code = -1
            else:
                output = stdout.decode("utf-8", errors="replace") if stdout else ""
                exit_code = proc.returncode if proc.returncode is not None else 0
        except Exception as e:
            output = str(e)
            exit_code = -1
        return self._output_parser.parse(output, exit_code)
    async def _execute_in_session(
        self,
        command: str,
        session_id: str,
        timeout: float,
        working_dir: str | None,
        interactive: bool,
    ) -> ParsedOutput:
        """Run a command in the session identified by ``session_id``.

        The session is created on first use; ``working_dir`` is only passed
        along as the initial cwd for that creation.
        """
        session = self._session_manager.get_or_create(session_id, cwd=working_dir)
        if not interactive:
            return await session.execute(command, timeout=timeout)
        # Interactive commands run in a PTY seeded with the session's cwd/env.
        return await self._execute_with_pty(
            command, timeout, session.cwd, session.env
        )
    async def _execute_with_pty(
        self,
        command: str,
        timeout: float,
        cwd: str | None = None,
        env: dict[str, str] | None = None,
    ) -> ParsedOutput:
        """Run an interactive command in a throwaway PTY session.

        Any exception is turned into exit code -1 with the error text as
        output; the PTY is always closed afterwards.
        """
        session = PTYSession()
        try:
            await session.start()
            outcome = await session.run_command(
                command,
                timeout=timeout,
                cwd=cwd,
                env=env,
            )
            output, exit_code = outcome.output, outcome.exit_code
        except Exception as e:
            output, exit_code = str(e), -1
        finally:
            await session.close()
        return self._output_parser.parse(output, exit_code)
    def _is_dangerous(self, command: str) -> bool:
        """Decide whether a command needs human confirmation.

        An allowlisted command is let through only when it is a single simple
        command: the allowlisted prefix must end on a word boundary and the
        command must contain no shell control or redirection operators.
        Everything else is checked against the dangerous patterns.

        Args:
            command: Command text to inspect.

        Returns:
            True when the command matches a dangerous pattern and is not
            covered by the allowlist.
        """
        command_stripped = command.strip()
        # A plain prefix test would wave through "ls; rm -rf /" or "catalog ..."
        # just because the text starts with an allowlisted word, so chained,
        # piped, substituted or redirected commands never use the allowlist.
        shell_operators = (";", "&", "|", "`", "$(", ">", "<", "\n")
        is_simple = not any(op in command_stripped for op in shell_operators)
        if is_simple:
            for prefix in _SAFE_COMMAND_PREFIXES:
                if not command_stripped.startswith(prefix):
                    continue
                rest = command_stripped[len(prefix):]
                # Require a word boundary so "id" does not vouch for "identify".
                if not rest or rest[0].isspace():
                    return False
        # Dangerous-pattern check (case-insensitive substring match)
        command_lower = command_stripped.lower()
        return any(pattern in command_lower for pattern in _DANGEROUS_PATTERNS)
    async def _request_confirmation(self, command: str) -> bool:
        """Ask the confirm callback whether a dangerous command may run.

        Args:
            command: Command awaiting confirmation.

        Returns:
            The callback's answer; False when no callback is configured or
            the callback raises.
        """
        callback = self._confirm_callback
        if not callback:
            # Refuse by default when nobody can confirm
            logger.warning("危险命令被拒绝（无确认回调）: %s", command[:100])
            return False
        try:
            return await callback(command)
        except Exception as e:
            logger.warning("确认回调执行失败: %s", e)
            return False
    def _log_audit(
        self,
        command: str,
        session_id: str | None,
        exit_code: int | None = None,
        blocked: bool = False,
    ) -> None:
        """Append an audit entry and mirror it to the logger.

        The stored command is capped at 500 characters, the logged one at 100.
        """
        self._audit_log.append(
            {
                "timestamp": time.time(),
                "command": command[:500],
                "session_id": session_id,
                "exit_code": exit_code,
                "blocked": blocked,
            }
        )
        logger.info(
            "Shell audit: command=%r session=%s exit=%s blocked=%s",
            command[:100],
            session_id,
            exit_code,
            blocked,
        )
    @property
    def session_manager(self) -> TerminalSessionManager:
        """The session manager used for session-mode commands."""
        return self._session_manager
    @property
    def audit_log(self) -> list[dict[str, Any]]:
        """The audit log entries (a shallow copy of the list)."""
        return list(self._audit_log)
--- a/src/agentkit/tools/terminal_session.py
+++ b/src/agentkit/tools/terminal_session.py
@ -0,0 +1,352 @@
 """TerminalSession - 终端会话状态管理
 维护 cwd、env、history，支持跨命令保持状态。
 通过在命令前注入 cd 和 export 语句实现跨命令状态持久化。
 """
 from __future__ import annotations
 import asyncio
 import logging
 import os
 import time
 from dataclasses import dataclass, field
 from typing import Any
 from agentkit.tools.output_parser import OutputParser, ParsedOutput
 logger = logging.getLogger(__name__)
@dataclass
 class CommandRecord:
    """Record of one executed command.

    Attributes:
        command: The command that was executed.
        exit_code: Exit code.
        output: Combined standard output and standard error.
        cwd: Working directory at execution time.
        timestamp: Execution timestamp.
        duration_ms: Execution time in milliseconds.
    """
    command: str
    exit_code: int
    output: str
    cwd: str
    timestamp: float
    duration_ms: int
 class TerminalSession:
    """终端会话 - 跨命令保持 cwd/env/history 状态
    通过在命令前注入 `cd {cwd} && ` 和 `export K=V && ` 实现跨命令状态持久化。
    每次命令执行后自动更新 cwd 和 env 状态。
    Usage:
        session = TerminalSession(session_id="build-01")
        result = await session.execute("cd /tmp")
        result = await session.execute("pwd")  # 输出 /tmp
    """
    def __init__(
        self,
        session_id: str,
        cwd: str | None = None,
        env: dict[str, str] | None = None,
        max_history: int = 1000,
    ):
        """Create a session.

        Args:
            session_id: Identifier of this session.
            cwd: Initial working directory; the process cwd is used when falsy.
            env: Initial environment; a copy of ``os.environ`` is used when falsy.
            max_history: Maximum number of command records kept.
        """
        self.session_id = session_id
        self._cwd = cwd or os.getcwd()
        # Always a private copy, so later changes never touch the caller's mapping
        self._env: dict[str, str] = dict(env or os.environ)
        self._history: list[CommandRecord] = []
        self._max_history = max_history
        self._output_parser = OutputParser()
        self._created_at = time.time()
    @property
    def cwd(self) -> str:
        """Current working directory tracked by this session."""
        return self._cwd
    @property
    def env(self) -> dict[str, str]:
        """当前环境变量（副本）"""
        return dict(self._env)
    @property
    def history(self) -> list[CommandRecord]:
        """命令执行历史（副本）"""
        return list(self._history)
    @property
    def created_at(self) -> float:
        """Timestamp at which the session was created."""
        return self._created_at
    def get_cwd(self) -> str:
        """获取当前工作目录"""
        return self._cwd
    def set_cwd(self, cwd: str) -> None:
        """Set the current working directory manually (the path is not checked)."""
        self._cwd = cwd
    def get_env(self) -> dict[str, str]:
        """获取当前环境变量（副本）"""
        return dict(self._env)
    def set_env(self, key: str, value: str) -> None:
        """设置单个环境变量"""
        self._env[key] = value
    def update_env(self, env: dict[str, str]) -> None:
        """Update several environment variables at once."""
        self._env.update(env)
    def get_history(self) -> list[CommandRecord]:
        """获取命令执行历史（副本）"""
        return list(self._history)
    async def execute(
        self,
        command: str,
        timeout: float | None = None,
    ) -> ParsedOutput:
        """Execute a command in the session context.

        The command is prefixed with the session's ``cd``/``export`` statements.
        After a successful run the session's cwd and env are updated from the
        command text, and a history record is added.

        Args:
            command: Command to execute.
            timeout: Timeout in seconds; ``None`` means no timeout.

        Returns:
            The structured ``ParsedOutput``. A timeout or spawn failure is
            reported with exit code -1.
        """
        full_command = self._build_command(command)
        # Remember where the command actually ran: the state update below may
        # change self._cwd (e.g. for "cd /tmp"), and the history record is
        # documented as holding the working directory at execution time.
        cwd_at_start = self._cwd
        start = time.monotonic()
        try:
            proc = await asyncio.create_subprocess_shell(
                full_command,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.STDOUT,
                env=self._env,
            )
            try:
                stdout, _ = await asyncio.wait_for(
                    proc.communicate(),
                    timeout=timeout,
                )
            except asyncio.TimeoutError:
                proc.kill()
                await proc.wait()
                output = f"命令执行超时（{timeout}s）"
                exit_code = -1
            else:
                output = stdout.decode("utf-8", errors="replace") if stdout else ""
                exit_code = proc.returncode if proc.returncode is not None else 0
        except Exception as e:
            output = str(e)
            exit_code = -1
        duration_ms = int((time.monotonic() - start) * 1000)
        # Update session state
        self._update_state_after_execution(command, output, exit_code)
        # Record history
        record = CommandRecord(
            command=command,
            exit_code=exit_code,
            output=output,
            cwd=cwd_at_start,
            timestamp=time.time(),
            duration_ms=duration_ms,
        )
        self._add_history(record)
        # Parse output
        parsed = self._output_parser.parse(output, exit_code)
        logger.debug(
            "Session %s: command=%r exit_code=%d duration=%dms",
            self.session_id,
            command,
            exit_code,
            duration_ms,
        )
        return parsed
    def _build_command(self, command: str) -> str:
        """构建带会话状态的完整命令
        在原始命令前注入 cd 和 export 语句。
        """
        parts: list[str] = []
        # 注入 cd
        if self._cwd:
            # 使用 shlex.quote 风格的简单转义
            cwd_escaped = self._cwd.replace("'", "'\\''")
            parts.append(f"cd '{cwd_escaped}'")
        # 注入环境变量
        for key, value in self._env.items():
            # 跳过 os.environ 中已有的且值未变的变量，减少命令长度
            val_escaped = value.replace("'", "'\\''")
            parts.append(f"export {key}='{val_escaped}'")
        parts.append(command)
        return " && ".join(parts)
    def _update_state_after_execution(
        self, command: str, output: str, exit_code: int
    ) -> None:
        """命令执行后更新会话状态
        解析 cd 和 export 命令更新 cwd 和 env。
        """
        if exit_code != 0:
            return
        # 解析 cd 命令更新 cwd
        self._parse_cd_commands(command, output)
        # 解析 export 命令更新 env
        self._parse_export_commands(command)
    def _parse_cd_commands(self, command: str, output: str) -> None:
        """从命令中解析 cd 并更新 cwd
        支持:
        - cd /path
        - cd dir (相对路径，需要通过 pwd 获取实际路径)
        - cd - (切换到上一个目录)
        """
        import re
        # 匹配 cd 命令（可能出现在 && 链中）
        cd_pattern = re.compile(r"(?:^|\s|&&\s*)cd\s+(.+?)(?:\s*&&|\s*$)")
        matches = cd_pattern.findall(command)
        for target in matches:
            target = target.strip().strip("'\"")
            if not target:
                continue
            if target == "-":
                # cd - 切换到 OLDPWD
                old_pwd = self._env.get("OLDPWD")
                if old_pwd:
                    self._cwd = old_pwd
            elif os.path.isabs(target):
                self._cwd = target
            else:
                # 相对路径：拼接后规范化
                new_cwd = os.path.normpath(os.path.join(self._cwd, target))
                self._cwd = new_cwd
    def _parse_export_commands(self, command: str) -> None:
        """从命令中解析 export 并更新 env
        支持:
        - export KEY=VALUE
        - export KEY="VALUE WITH SPACES"
        """
        import re
        export_pattern = re.compile(
            r"(?:^|\s|&&\s*)export\s+(\w+)=(.+?)(?:\s*&&|\s*$)"
        )
        matches = export_pattern.findall(command)
        for key, value in matches:
            value = value.strip().strip("'\"")
            self._env[key] = value
    def _add_history(self, record: CommandRecord) -> None:
        """添加命令记录到历史，超出上限时移除最旧记录"""
        self._history.append(record)
        while len(self._history) > self._max_history:
            self._history.pop(0)
    def close(self) -> None:
        """Close the session; currently this only logs how many commands are in the history."""
        logger.info(
            "Session %s closed: %d commands executed",
            self.session_id,
            len(self._history),
        )
 class TerminalSessionManager:
    """Registry of TerminalSession objects keyed by session ID.

    Usage:
        manager = TerminalSessionManager()
        session = manager.get_or_create("build-01")
        session = manager.get("build-01")
        manager.remove("build-01")
    """
    def __init__(self, max_sessions: int = 100):
        self._max_sessions = max_sessions
        self._sessions: dict[str, TerminalSession] = {}
    def get_or_create(
        self,
        session_id: str,
        cwd: str | None = None,
        env: dict[str, str] | None = None,
    ) -> TerminalSession:
        """Return the session with this ID, creating it when missing.

        Args:
            session_id: Session ID.
            cwd: Initial working directory (used only on creation).
            env: Initial environment variables (used only on creation).

        Returns:
            The TerminalSession instance.
        """
        if session_id in self._sessions:
            return self._sessions[session_id]
        if len(self._sessions) >= self._max_sessions:
            # At capacity: evict the session that was created first
            oldest_id, _ = min(
                self._sessions.items(), key=lambda item: item[1].created_at
            )
            self.remove(oldest_id)
        created = TerminalSession(
            session_id=session_id,
            cwd=cwd,
            env=env,
        )
        self._sessions[session_id] = created
        logger.info("Session created: %s", session_id)
        return self._sessions[session_id]
    def get(self, session_id: str) -> TerminalSession | None:
        """Return the session, or None when it does not exist."""
        return self._sessions.get(session_id)
    def remove(self, session_id: str) -> None:
        """Remove the session and close it; unknown IDs are ignored."""
        removed = self._sessions.pop(session_id, None)
        if removed:
            removed.close()
    def list_sessions(self) -> list[str]:
        """List all session IDs."""
        return [sid for sid in self._sessions]
    def has_session(self, session_id: str) -> bool:
        """Tell whether a session with this ID exists."""
        return session_id in self._sessions
    def close_all(self) -> None:
        """Remove and close every session."""
        # Iterate over a snapshot because remove() mutates the mapping
        for sid in tuple(self._sessions):
            self.remove(sid)
--- a/tests/unit/evolution/test_path_optimizer.py
+++ b/tests/unit/evolution/test_path_optimizer.py
@ -0,0 +1,512 @@
 """Tests for PathOptimizer - 执行路径优化器"""
 from __future__ import annotations
 from datetime import datetime, timezone
 import pytest
 from agentkit.evolution.path_optimizer import ExecutionPath, PathOptimizer, PathUpdateResult
 # ── Fixtures ──────────────────────────────────────────────
@pytest.fixture
 def optimizer():
    """Default PathOptimizer instance."""
    return PathOptimizer(min_sample_count=3, success_rate_threshold=0.05, duration_improvement_threshold=0.2)
@pytest.fixture
def optimizer_custom_thresholds():
    """PathOptimizer with custom thresholds: 5 samples, 0.1 success-rate gap, 0.3 duration gain."""
    stricter = PathOptimizer(min_sample_count=5, success_rate_threshold=0.1, duration_improvement_threshold=0.3)
    return stricter
 def _make_path(
    task_type: str = "code_review",
    steps: list[str] | None = None,
    total_duration: float = 10.0,
    success_rate: float = 0.8,
    sample_count: int = 5,
    is_recommended: bool = False,
    path_id: str = "",
    created_at: datetime | None = None,
 ) -> ExecutionPath:
    """Build an ExecutionPath for tests.

    Args:
        task_type: Task type stored on the path.
        steps: Step names; ``None`` selects a three-step default. An
            explicitly passed empty list is kept as-is.
        total_duration: Total duration value stored on the path.
        success_rate: Success rate value stored on the path.
        sample_count: Sample count stored on the path.
        is_recommended: Initial value of the recommended flag.
        path_id: Path identifier; empty by default.
        created_at: Creation timestamp; ``None`` selects the current UTC time.

    Returns:
        A new ExecutionPath populated with the given values.
    """
    # Use explicit None checks rather than `or`, so that a caller who passes
    # steps=[] really gets a path with no steps instead of the default ones.
    if steps is None:
        steps = ["step1", "step2", "step3"]
    if created_at is None:
        created_at = datetime.now(timezone.utc)
    return ExecutionPath(
        path_id=path_id,
        task_type=task_type,
        steps=steps,
        total_duration=total_duration,
        success_rate=success_rate,
        sample_count=sample_count,
        is_recommended=is_recommended,
        created_at=created_at,
    )
 # ── ExecutionPath 数据模型测试 ────────────────────────────
 class TestExecutionPath:
    """Field-level checks for the ExecutionPath data model."""

    def test_default_values(self):
        """An ExecutionPath built with no arguments exposes the declared defaults."""
        blank = ExecutionPath()
        equal_defaults = {
            "path_id": "",
            "task_type": "",
            "steps": [],
            "total_duration": 0.0,
            "success_rate": 0.0,
            "sample_count": 0,
        }
        for field_name, default in equal_defaults.items():
            assert getattr(blank, field_name) == default
        # Identity / type checks are kept separate from the equality checks.
        assert blank.is_recommended is False
        assert isinstance(blank.created_at, datetime)

    def test_custom_values(self):
        """Every explicitly supplied field is stored unchanged."""
        stamp = datetime.now(timezone.utc)
        supplied = {
            "path_id": "p1",
            "task_type": "code_review",
            "steps": ["analyze", "review", "report"],
            "total_duration": 15.5,
            "success_rate": 0.9,
            "sample_count": 10,
            "created_at": stamp,
        }
        custom = ExecutionPath(is_recommended=True, **supplied)
        for field_name, expected in supplied.items():
            assert getattr(custom, field_name) == expected
        assert custom.is_recommended is True
 # ── PathUpdateResult 数据模型测试 ─────────────────────────
 class TestPathUpdateResult:
    """Field-level checks for the PathUpdateResult data model."""

    def test_default_values(self):
        """A PathUpdateResult built with no arguments reports 'not updated'."""
        empty = PathUpdateResult()
        assert empty.updated is False
        for attr in ("old_path", "new_path"):
            assert getattr(empty, attr) is None
        assert empty.reason == ""

    def test_updated_result(self):
        """Explicitly supplied old/new paths and reason are stored as given."""
        previous = _make_path(success_rate=0.7)
        replacement = _make_path(success_rate=0.9)
        outcome = PathUpdateResult(
            updated=True, old_path=previous, new_path=replacement, reason="成功率显著提升"
        )
        assert outcome.updated is True
        assert outcome.old_path.success_rate == 0.7
        assert outcome.new_path.success_rate == 0.9
        assert "成功率" in outcome.reason
 # ── get_recommended_path 测试 ─────────────────────────────
 class TestGetRecommendedPath:
    """Tests for PathOptimizer.get_recommended_path."""

    async def test_no_recommended_path(self, optimizer):
        """A fresh optimizer has no recommendation for a task type."""
        assert optimizer.get_recommended_path("code_review") is None

    async def test_returns_recommended_path(self, optimizer):
        """After one accepted submission, that path is returned and flagged."""
        submitted = _make_path(task_type="code_review", success_rate=0.8, sample_count=5)
        await optimizer.evaluate_and_update("code_review", submitted)
        recommended = optimizer.get_recommended_path("code_review")
        assert recommended is not None
        assert recommended.success_rate == 0.8
        assert recommended.is_recommended is True

    async def test_different_task_types_independent(self, optimizer):
        """Each task type keeps its own recommended path."""
        expected_rates = {"code_review": 0.8, "data_analysis": 0.9}
        # Build every path before submitting any of them.
        submissions = [
            (task, _make_path(task_type=task, success_rate=rate, sample_count=5))
            for task, rate in expected_rates.items()
        ]
        for task, candidate in submissions:
            await optimizer.evaluate_and_update(task, candidate)
        lookups = {task: optimizer.get_recommended_path(task) for task in expected_rates}
        for found in lookups.values():
            assert found is not None
        for task, rate in expected_rates.items():
            assert lookups[task].success_rate == rate
 # ── 样本量不足测试 ────────────────────────────────────────
 class TestInsufficientSamples:
    """Paths with fewer samples than the configured minimum are not promoted."""

    async def test_insufficient_samples_no_update(self, optimizer):
        """Two samples against a minimum of three: no update, nothing recommended."""
        sparse = _make_path(sample_count=2, success_rate=0.9)
        outcome = await optimizer.evaluate_and_update("code_review", sparse)
        assert outcome.updated is False
        assert "样本量不足" in outcome.reason
        assert optimizer.get_recommended_path("code_review") is None

    async def test_insufficient_samples_recorded_as_pending(self, optimizer):
        """An under-sampled path shows up in the pending list."""
        sparse = _make_path(sample_count=2, success_rate=0.9)
        await optimizer.evaluate_and_update("code_review", sparse)
        waiting = optimizer.get_pending_paths("code_review")
        assert len(waiting) == 1
        assert waiting[0].success_rate == 0.9

    async def test_exact_min_samples_updates(self, optimizer):
        """A sample count equal to the minimum is enough to be promoted."""
        borderline = _make_path(sample_count=3, success_rate=0.8)
        outcome = await optimizer.evaluate_and_update("code_review", borderline)
        assert outcome.updated is True
        assert outcome.reason == "无现有推荐路径，直接设为推荐"

    async def test_custom_min_sample_count(self, optimizer_custom_thresholds):
        """With a minimum of five samples, four samples are rejected."""
        sparse = _make_path(sample_count=4, success_rate=0.9)
        outcome = await optimizer_custom_thresholds.evaluate_and_update("code_review", sparse)
        assert outcome.updated is False
        assert "样本量不足" in outcome.reason
 # ── 首次设置推荐路径测试 ──────────────────────────────────
 class TestFirstRecommendation:
    """Behaviour when no recommended path exists yet for the task type."""

    async def test_first_path_becomes_recommended(self, optimizer):
        """The first sufficiently sampled path is promoted directly."""
        first = _make_path(success_rate=0.7, sample_count=5)
        outcome = await optimizer.evaluate_and_update("code_review", first)
        assert outcome.updated is True
        assert outcome.old_path is None
        promoted = outcome.new_path
        assert promoted is not None
        assert promoted.is_recommended is True
        assert "无现有推荐路径" in outcome.reason

    async def test_auto_generates_path_id(self, optimizer):
        """A path submitted with an empty path_id comes back with a non-empty one."""
        anonymous = _make_path(path_id="", sample_count=5)
        outcome = await optimizer.evaluate_and_update("code_review", anonymous)
        assert outcome.updated is True
        assert outcome.new_path is not None
        assert len(outcome.new_path.path_id) > 0
 # ── 成功率显著提升测试 ────────────────────────────────────
 class TestSuccessRateImprovement:
    """Replacement decisions driven by the success-rate difference."""

    async def test_higher_success_rate_updates(self, optimizer):
        """A clearly higher success rate (0.7 -> 0.85) replaces the recommendation."""
        baseline = _make_path(success_rate=0.7, sample_count=5)
        await optimizer.evaluate_and_update("code_review", baseline)
        challenger = _make_path(success_rate=0.85, sample_count=5)
        outcome = await optimizer.evaluate_and_update("code_review", challenger)
        assert outcome.updated is True
        assert outcome.old_path.success_rate == 0.7
        assert outcome.new_path.success_rate == 0.85
        assert "成功率显著提升" in outcome.reason

    async def test_marginal_success_rate_no_update(self, optimizer):
        """A gain of only 0.03 is below the 0.05 threshold, so nothing changes."""
        baseline = _make_path(success_rate=0.8, sample_count=5)
        await optimizer.evaluate_and_update("code_review", baseline)
        challenger = _make_path(success_rate=0.83, sample_count=5)
        outcome = await optimizer.evaluate_and_update("code_review", challenger)
        assert outcome.updated is False
        assert "无明显优势" in outcome.reason

    async def test_custom_success_rate_threshold(self, optimizer_custom_thresholds):
        """With a 0.1 threshold, a gain of 0.08 is not enough."""
        baseline = _make_path(success_rate=0.7, sample_count=10)
        await optimizer_custom_thresholds.evaluate_and_update("code_review", baseline)
        challenger = _make_path(success_rate=0.78, sample_count=10)
        outcome = await optimizer_custom_thresholds.evaluate_and_update("code_review", challenger)
        assert outcome.updated is False

    async def test_lower_success_rate_no_update(self, optimizer):
        """A path with a lower success rate never replaces the recommendation."""
        baseline = _make_path(success_rate=0.9, sample_count=5)
        await optimizer.evaluate_and_update("code_review", baseline)
        challenger = _make_path(success_rate=0.6, sample_count=5)
        outcome = await optimizer.evaluate_and_update("code_review", challenger)
        assert outcome.updated is False
 # ── 耗时显著更短测试 ──────────────────────────────────────
 class TestDurationImprovement:
    """Replacement decisions driven by duration when success rates are close."""

    @staticmethod
    async def _challenge(opt, baseline_kwargs, challenger_kwargs):
        """Submit a baseline path, then a challenger; return the challenger's result.

        The baseline is built and evaluated before the challenger is built,
        so both submissions happen in the same order as a hand-written test.
        """
        baseline = _make_path(**baseline_kwargs)
        await opt.evaluate_and_update("code_review", baseline)
        challenger = _make_path(**challenger_kwargs)
        return await opt.evaluate_and_update("code_review", challenger)

    async def test_shorter_duration_with_similar_success_rate_updates(self, optimizer):
        """Similar success rate and 30% less time (above the 20% threshold) -> update."""
        outcome = await self._challenge(
            optimizer,
            {"total_duration": 100.0, "success_rate": 0.8, "sample_count": 5},
            {"total_duration": 70.0, "success_rate": 0.82, "sample_count": 5},
        )
        assert outcome.updated is True
        assert "耗时显著更短" in outcome.reason

    async def test_marginal_duration_improvement_no_update(self, optimizer):
        """Only 10% less time (below the 20% threshold) -> no update."""
        outcome = await self._challenge(
            optimizer,
            {"total_duration": 100.0, "success_rate": 0.8, "sample_count": 5},
            {"total_duration": 90.0, "success_rate": 0.82, "sample_count": 5},
        )
        assert outcome.updated is False
        assert "无明显优势" in outcome.reason

    async def test_longer_duration_no_update(self, optimizer):
        """A slower challenger does not replace the recommendation."""
        outcome = await self._challenge(
            optimizer,
            {"total_duration": 50.0, "success_rate": 0.8, "sample_count": 5},
            {"total_duration": 80.0, "success_rate": 0.82, "sample_count": 5},
        )
        assert outcome.updated is False

    async def test_custom_duration_improvement_threshold(self, optimizer_custom_thresholds):
        """With a 30% threshold, 25% less time is not enough."""
        outcome = await self._challenge(
            optimizer_custom_thresholds,
            {"total_duration": 100.0, "success_rate": 0.8, "sample_count": 10},
            {"total_duration": 75.0, "success_rate": 0.82, "sample_count": 10},
        )
        assert outcome.updated is False

    async def test_zero_duration_current_path(self, optimizer):
        """A recommended path with zero duration is not replaced on duration grounds."""
        outcome = await self._challenge(
            optimizer,
            {"total_duration": 0.0, "success_rate": 0.8, "sample_count": 5},
            {"total_duration": 10.0, "success_rate": 0.82, "sample_count": 5},
        )
        assert outcome.updated is False

    async def test_both_zero_duration(self, optimizer):
        """When both durations are zero, duration cannot trigger an update."""
        outcome = await self._challenge(
            optimizer,
            {"total_duration": 0.0, "success_rate": 0.8, "sample_count": 5},
            {"total_duration": 0.0, "success_rate": 0.82, "sample_count": 5},
        )
        assert outcome.updated is False
 # ── 保留现有推荐路径测试 ──────────────────────────────────
 class TestKeepCurrentPath:
    """The current recommendation survives challengers with no clear advantage."""

    async def test_no_advantage_keeps_current(self, optimizer):
        """A marginally faster, marginally less reliable path changes nothing."""
        incumbent = _make_path(total_duration=50.0, success_rate=0.8, sample_count=5)
        await optimizer.evaluate_and_update("code_review", incumbent)
        challenger = _make_path(total_duration=48.0, success_rate=0.79, sample_count=5)
        outcome = await optimizer.evaluate_and_update("code_review", challenger)
        assert outcome.updated is False
        assert outcome.old_path.success_rate == 0.8
        # The stored recommendation must still be the incumbent.
        still_recommended = optimizer.get_recommended_path("code_review")
        assert still_recommended is not None
        assert still_recommended.success_rate == 0.8

    async def test_is_recommended_flag_preserved(self, optimizer):
        """A rejected challenger leaves the incumbent's is_recommended flag True."""
        incumbent = _make_path(success_rate=0.8, sample_count=5)
        await optimizer.evaluate_and_update("code_review", incumbent)
        challenger = _make_path(success_rate=0.79, sample_count=5)
        await optimizer.evaluate_and_update("code_review", challenger)
        still_recommended = optimizer.get_recommended_path("code_review")
        assert still_recommended is not None
        assert still_recommended.is_recommended is True
 # ── is_recommended 标志管理测试 ────────────────────────────
 class TestIsRecommendedFlag:
    """Bookkeeping of the is_recommended flag across a replacement."""

    async def test_old_path_loses_recommended_flag(self, optimizer):
        """After a replacement the old path is unflagged and the new one flagged."""
        incumbent = _make_path(success_rate=0.7, sample_count=5)
        await optimizer.evaluate_and_update("code_review", incumbent)
        # The submitted object itself is expected to carry the flag after promotion.
        assert incumbent.is_recommended is True
        challenger = _make_path(success_rate=0.9, sample_count=5)
        outcome = await optimizer.evaluate_and_update("code_review", challenger)
        assert outcome.updated is True
        assert outcome.old_path.is_recommended is False
        assert outcome.new_path.is_recommended is True
 # ── 多次迭代优化测试 ──────────────────────────────────────
 class TestIterativeOptimization:
    """Sequences of submissions against the same optimizer."""

    async def test_multiple_updates_converge_to_best(self, optimizer):
        """Four successive submissions end with the third path recommended."""
        task = "code_review"

        # Round 1: first path is accepted.
        initial = _make_path(success_rate=0.6, total_duration=100.0, sample_count=5)
        await optimizer.evaluate_and_update(task, initial)
        assert optimizer.get_recommended_path(task).success_rate == 0.6

        # Round 2: clearly higher success rate.
        more_reliable = _make_path(success_rate=0.8, total_duration=90.0, sample_count=5)
        await optimizer.evaluate_and_update(task, more_reliable)
        assert optimizer.get_recommended_path(task).success_rate == 0.8

        # Round 3: similar success rate, much shorter duration.
        faster = _make_path(success_rate=0.82, total_duration=50.0, sample_count=5)
        await optimizer.evaluate_and_update(task, faster)
        assert optimizer.get_recommended_path(task).total_duration == 50.0

        # Round 4: no clear advantage, recommendation stays.
        near_tie = _make_path(success_rate=0.81, total_duration=48.0, sample_count=5)
        last_outcome = await optimizer.evaluate_and_update(task, near_tie)
        assert last_outcome.updated is False
        assert optimizer.get_recommended_path(task).total_duration == 50.0

    async def test_different_task_types_evolve_independently(self, optimizer):
        """Updating one task type leaves another task type's recommendation alone."""
        review_first = _make_path(task_type="code_review", success_rate=0.7, sample_count=5)
        analysis_first = _make_path(task_type="data_analysis", success_rate=0.6, sample_count=5)
        await optimizer.evaluate_and_update("code_review", review_first)
        await optimizer.evaluate_and_update("data_analysis", analysis_first)
        review_second = _make_path(task_type="code_review", success_rate=0.9, sample_count=5)
        await optimizer.evaluate_and_update("code_review", review_second)
        # Only code_review received a second submission.
        review_rate = optimizer.get_recommended_path("code_review").success_rate
        assert review_rate == 0.9
        analysis_rate = optimizer.get_recommended_path("data_analysis").success_rate
        assert analysis_rate == 0.6
 # ── 待观察路径管理测试 ────────────────────────────────────
 class TestPendingPaths:
    """Tests for the pending (under-sampled) path list."""

    async def test_pending_paths_empty_initially(self, optimizer):
        """A fresh optimizer has no pending paths."""
        waiting = optimizer.get_pending_paths("code_review")
        assert waiting == []

    async def test_pending_paths_accumulate(self, optimizer):
        """Each under-sampled submission adds one pending entry."""
        first_sparse = _make_path(sample_count=1, success_rate=0.9)
        second_sparse = _make_path(sample_count=2, success_rate=0.85)
        for sparse in (first_sparse, second_sparse):
            await optimizer.evaluate_and_update("code_review", sparse)
        waiting = optimizer.get_pending_paths("code_review")
        assert len(waiting) == 2

    async def test_pending_paths_isolated_by_task_type(self, optimizer):
        """Pending entries are tracked per task type."""
        review_sparse = _make_path(task_type="code_review", sample_count=1, success_rate=0.9)
        analysis_sparse = _make_path(task_type="data_analysis", sample_count=1, success_rate=0.8)
        await optimizer.evaluate_and_update("code_review", review_sparse)
        await optimizer.evaluate_and_update("data_analysis", analysis_sparse)
        for task in ("code_review", "data_analysis"):
            assert len(optimizer.get_pending_paths(task)) == 1

    async def test_sufficient_samples_not_pending(self, optimizer):
        """A sufficiently sampled path is not placed on the pending list."""
        well_sampled = _make_path(sample_count=5, success_rate=0.8)
        await optimizer.evaluate_and_update("code_review", well_sampled)
        assert optimizer.get_pending_paths("code_review") == []
 # ── ExperienceStore 集成测试 ──────────────────────────────
 class TestExperienceStoreIntegration:
    """PathOptimizer works both with and without an experience store."""

    async def test_with_experience_store(self):
        """An optimizer constructed with an InMemoryExperienceStore accepts a path."""
        from agentkit.evolution.experience_store import InMemoryExperienceStore

        backing_store = InMemoryExperienceStore()
        store_backed = PathOptimizer(experience_store=backing_store, min_sample_count=3)
        candidate = _make_path(success_rate=0.8, sample_count=5)
        outcome = await store_backed.evaluate_and_update("code_review", candidate)
        assert outcome.updated is True

    async def test_without_experience_store(self, optimizer):
        """An optimizer constructed without a store accepts a path as well."""
        candidate = _make_path(success_rate=0.8, sample_count=5)
        outcome = await optimizer.evaluate_and_update("code_review", candidate)
        assert outcome.updated is True
 # ── 边界条件测试 ──────────────────────────────────────────
 class TestEdgeCases:
    """Boundary and unusual-input cases for evaluate_and_update."""

    async def test_same_path_twice(self, optimizer):
        """An equal-valued second path (a distinct instance) is not an improvement."""
        original = _make_path(success_rate=0.8, sample_count=5)
        first_outcome = await optimizer.evaluate_and_update("code_review", original)
        assert first_outcome.updated is True
        # Same parameters, but a separate object.
        twin = _make_path(success_rate=0.8, sample_count=5)
        second_outcome = await optimizer.evaluate_and_update("code_review", twin)
        assert second_outcome.updated is False

    async def test_success_rate_at_boundary(self, optimizer):
        """A nominal gain of exactly the 0.05 threshold does not trigger an update.

        NOTE(review): in binary floating point ``0.85 - 0.8`` evaluates to
        0.04999999999999993, i.e. slightly below 0.05, so depending on how the
        optimizer computes the gain this may not exercise strict equality —
        confirm against the implementation.
        """
        incumbent = _make_path(success_rate=0.8, sample_count=5)
        await optimizer.evaluate_and_update("code_review", incumbent)
        challenger = _make_path(success_rate=0.85, sample_count=5)
        outcome = await optimizer.evaluate_and_update("code_review", challenger)
        assert outcome.updated is False

    async def test_duration_improvement_at_boundary(self, optimizer):
        """A nominal duration gain of exactly 20% does not trigger an update."""
        incumbent = _make_path(total_duration=100.0, success_rate=0.8, sample_count=5)
        await optimizer.evaluate_and_update("code_review", incumbent)
        challenger = _make_path(total_duration=80.0, success_rate=0.82, sample_count=5)
        outcome = await optimizer.evaluate_and_update("code_review", challenger)
        assert outcome.updated is False

    async def test_zero_sample_count(self, optimizer):
        """A path with zero samples is rejected as under-sampled."""
        unsampled = _make_path(sample_count=0, success_rate=0.9)
        outcome = await optimizer.evaluate_and_update("code_review", unsampled)
        assert outcome.updated is False
        assert "样本量不足" in outcome.reason

    async def test_path_task_type_override(self, optimizer):
        """The task_type argument overwrites the task_type stored on the path."""
        mislabeled = _make_path(task_type="wrong_type", success_rate=0.8, sample_count=5)
        outcome = await optimizer.evaluate_and_update("code_review", mislabeled)
        assert outcome.updated is True
        assert mislabeled.task_type == "code_review"
        assert optimizer.get_recommended_path("code_review") is not None
--- a/tests/unit/evolution/test_pitfall_detector.py
+++ b/tests/unit/evolution/test_pitfall_detector.py
@ -0,0 +1,595 @@
 """Tests for PitfallDetector - 任务避坑预警检测"""
 from __future__ import annotations
 from datetime import datetime, timezone
 import pytest
 from agentkit.core.plan_schema import PlanStep, PlanStepStatus
 from agentkit.evolution.experience_schema import TaskExperience
 from agentkit.evolution.experience_store import InMemoryExperienceStore
 from agentkit.evolution.pitfall_detector import (
    PitfallDetector,
    PitfallWarning,
    WarningLevel,
    _compute_name_similarity,
    _determine_warning_level,
    _extract_keywords,
 )
 # ── Fixtures ──────────────────────────────────────────────
@pytest.fixture
def store():
    """InMemoryExperienceStore constructed without an embedder argument."""
    experience_store = InMemoryExperienceStore(decay_rate=0.01, alpha=0.7)
    return experience_store
@pytest.fixture
def detector(store):
    """PitfallDetector backed by the ``store`` fixture, similarity threshold 0.3."""
    options = {"experience_store": store, "similarity_threshold": 0.3}
    return PitfallDetector(**options)
 def _make_experience(
    task_type: str = "code_review",
    goal: str = "Review the PR",
    outcome: str = "success",
    steps_summary: str | list[dict] = "",
    failure_reasons: list[str] | None = None,
    optimization_tips: list[str] | None = None,
    success_rate: float = 1.0,
 ) -> TaskExperience:
    """Build a TaskExperience for tests.

    The experience ID is left empty, the duration is fixed at 10 seconds and
    the creation time is the current UTC time; missing list arguments become
    empty lists.
    """
    reasons = failure_reasons or []
    tips = optimization_tips or []
    fields = {
        "experience_id": "",
        "task_type": task_type,
        "goal": goal,
        "steps_summary": steps_summary,
        "outcome": outcome,
        "duration_seconds": 10.0,
        "success_rate": success_rate,
        "failure_reasons": reasons,
        "optimization_tips": tips,
        "created_at": datetime.now(timezone.utc),
    }
    return TaskExperience(**fields)
 def _make_step(
    name: str = "step",
    description: str = "do something",
    step_id: str = "s1",
 ) -> PlanStep:
    """Build a PlanStep in PENDING status for tests."""
    pending_step = PlanStep(
        step_id=step_id, name=name, description=description, status=PlanStepStatus.PENDING
    )
    return pending_step
 # ── 辅助函数测试 ──────────────────────────────────────────
 class TestExtractKeywords:
    """Tests for the _extract_keywords helper."""

    def test_basic_extraction(self):
        """Each word of a plain phrase appears lower-cased in the result."""
        found = _extract_keywords("Call API Gateway")
        for word in ("call", "api", "gateway"):
            assert word in found

    def test_stop_words_filtered(self):
        """Stop words are dropped while content words are kept."""
        found = _extract_keywords("Call the API and check the result")
        for stop_word in ("the", "and"):
            assert stop_word not in found
        for word in ("call", "api"):
            assert word in found

    def test_underscore_and_hyphen(self):
        """Underscores and hyphens act as word separators."""
        found = _extract_keywords("call_api-gateway")
        for word in ("call", "api", "gateway"):
            assert word in found

    def test_single_char_filtered(self):
        """Single-character tokens are dropped; two-character tokens are kept."""
        found = _extract_keywords("a b cd")
        for single in ("a", "b"):
            assert single not in found
        assert "cd" in found

    def test_empty_string(self):
        """An empty input yields no keywords."""
        found = _extract_keywords("")
        assert len(found) == 0
 class TestComputeNameSimilarity:
    """Tests for the _compute_name_similarity helper."""

    def test_identical_names(self):
        """Identical names score approximately 1.0."""
        score = _compute_name_similarity("Call API Gateway", "", "Call API Gateway")
        assert score == pytest.approx(1.0)

    def test_partial_overlap(self):
        """Names sharing some but not all words score strictly between 0 and 1."""
        # "call" and "api" are shared; "gateway" and "external" are not.
        score = _compute_name_similarity("Call API Gateway", "", "Call External API")
        assert 0.0 < score < 1.0

    def test_no_overlap(self):
        """Names with no words in common score 0.0."""
        score = _compute_name_similarity("Deploy Service", "", "Analyze Data")
        assert score == 0.0

    def test_description_contributes(self):
        """A description containing matching words never lowers the score."""
        without_description = _compute_name_similarity("Deploy", "", "Deploy Service")
        with_description = _compute_name_similarity("Deploy", "Deploy Service", "Deploy Service")
        assert with_description >= without_description

    def test_empty_inputs(self):
        """An empty name and description score 0.0 against any step name."""
        score = _compute_name_similarity("", "", "Call API")
        assert score == 0.0
 class TestDetermineWarningLevel:
    """Tests for the failure-rate to WarningLevel mapping."""

    def test_high_threshold(self):
        """Failure rates of 0.6 and 0.5 map to HIGH."""
        for rate in (0.6, 0.5):
            assert _determine_warning_level(rate) == WarningLevel.HIGH

    def test_medium_threshold(self):
        """Failure rates of 0.3 and 0.2 map to MEDIUM."""
        for rate in (0.3, 0.2):
            assert _determine_warning_level(rate) == WarningLevel.MEDIUM

    def test_low_threshold(self):
        """Failure rates of 0.1 and 0.01 map to LOW."""
        for rate in (0.1, 0.01):
            assert _determine_warning_level(rate) == WarningLevel.LOW
 # ── PitfallDetector.check_pitfalls 测试 ──────────────────
 class TestCheckPitfalls:
    async def test_no_planned_steps_returns_empty(self, detector):
        """An empty plan produces no warnings."""
        found = await detector.check_pitfalls(task_type="code_review", planned_steps=[])
        assert found == []
    async def test_no_failed_experiences_returns_empty(self, detector, store):
        """With only a successful experience recorded, no warnings are produced."""
        successful_run = _make_experience(task_type="code_review", outcome="success")
        await store.record_experience(successful_run)
        planned = [_make_step(name="Review Code")]
        found = await detector.check_pitfalls(task_type="code_review", planned_steps=planned)
        assert found == []
    async def test_high_failure_rate_returns_high_warning(self, detector, store):
        """A planned step that failed in most recorded runs yields a HIGH warning."""
        # Six failing deployments in which "Call API Gateway" is the failing step.
        # A fresh experience object is built for every recorded run.
        for _run in range(6):
            failing_run = _make_experience(
                task_type="deployment",
                outcome="failure",
                success_rate=0.0,
                steps_summary=[
                    {"step_name": "Call API Gateway", "outcome": "failure", "error": "Timeout"},
                    {"step_name": "Deploy Container", "outcome": "success"},
                ],
                failure_reasons=["API Gateway timeout"],
            )
            await store.record_experience(failing_run)
        # Four fully successful deployments are recorded afterwards.
        for _run in range(4):
            passing_run = _make_experience(
                task_type="deployment",
                outcome="success",
                success_rate=1.0,
                steps_summary=[
                    {"step_name": "Call API Gateway", "outcome": "success"},
                    {"step_name": "Deploy Container", "outcome": "success"},
                ],
            )
            await store.record_experience(passing_run)
        planned = [_make_step(name="Call API Gateway", description="Invoke API Gateway endpoint")]
        raised = await detector.check_pitfalls(task_type="deployment", planned_steps=planned)
        assert len(raised) == 1
        only_warning = raised[0]
        assert only_warning.step_name == "Call API Gateway"
        assert only_warning.warning_level == WarningLevel.HIGH
        assert only_warning.failure_rate >= 0.5
        assert "Timeout" in only_warning.historical_failures
    async def test_medium_failure_rate(self, detector, store):
        """Three failing and seven passing runs yield a MEDIUM warning."""
        # Failures are recorded first, then the successes.
        for _run in range(3):
            failing_run = _make_experience(
                task_type="data_analysis",
                outcome="failure",
                success_rate=0.0,
                steps_summary=[
                    {"step_name": "Fetch Data", "outcome": "failure", "error": "Connection refused"},
                ],
            )
            await store.record_experience(failing_run)
        for _run in range(7):
            passing_run = _make_experience(
                task_type="data_analysis",
                outcome="success",
                success_rate=1.0,
                steps_summary=[
                    {"step_name": "Fetch Data", "outcome": "success"},
                ],
            )
            await store.record_experience(passing_run)
        planned = [_make_step(name="Fetch Data", description="Fetch data from source")]
        raised = await detector.check_pitfalls(task_type="data_analysis", planned_steps=planned)
        assert len(raised) == 1
        only_warning = raised[0]
        assert only_warning.warning_level == WarningLevel.MEDIUM
        # The reported rate must fall inside the [0.2, 0.5) band.
        assert 0.2 <= only_warning.failure_rate < 0.5
    async def test_low_failure_rate(self, detector, store):
        """One failing and nine passing runs yield a LOW warning."""
        single_failure = _make_experience(
            task_type="testing",
            outcome="failure",
            success_rate=0.0,
            steps_summary=[
                {"step_name": "Run Unit Tests", "outcome": "failure", "error": "Flaky test"},
            ],
        )
        await store.record_experience(single_failure)
        # Nine successful runs follow, each built as a separate experience.
        for _run in range(9):
            passing_run = _make_experience(
                task_type="testing",
                outcome="success",
                success_rate=1.0,
                steps_summary=[
                    {"step_name": "Run Unit Tests", "outcome": "success"},
                ],
            )
            await store.record_experience(passing_run)
        planned = [_make_step(name="Run Unit Tests", description="Execute unit test suite")]
        raised = await detector.check_pitfalls(task_type="testing", planned_steps=planned)
        assert len(raised) == 1
        assert raised[0].warning_level == WarningLevel.LOW
    async def test_multiple_steps_with_risks_sorted_by_severity(self, detector, store):
        """With two risky planned steps, the HIGH warning is listed first."""
        # Phase 1: six runs where "Call API" fails and "Validate Input" succeeds.
        for _run in range(6):
            api_failure = _make_experience(
                task_type="integration",
                outcome="failure",
                success_rate=0.0,
                steps_summary=[
                    {"step_name": "Call API", "outcome": "failure", "error": "Timeout"},
                    {"step_name": "Validate Input", "outcome": "success"},
                ],
            )
            await store.record_experience(api_failure)
        # Phase 2: four runs where both steps succeed.
        for _run in range(4):
            clean_run = _make_experience(
                task_type="integration",
                outcome="success",
                success_rate=1.0,
                steps_summary=[
                    {"step_name": "Call API", "outcome": "success"},
                    {"step_name": "Validate Input", "outcome": "success"},
                ],
            )
            await store.record_experience(clean_run)
        # Phase 3: one partial run where only "Validate Input" fails.
        validation_failure = _make_experience(
            task_type="integration",
            outcome="partial",
            success_rate=0.5,
            steps_summary=[
                {"step_name": "Call API", "outcome": "success"},
                {"step_name": "Validate Input", "outcome": "failure", "error": "Invalid schema"},
            ],
        )
        await store.record_experience(validation_failure)
        # The plan lists the lower-risk step first on purpose.
        planned = [
            _make_step(name="Validate Input", description="Validate input data", step_id="s1"),
            _make_step(name="Call API", description="Call external API", step_id="s2"),
        ]
        raised = await detector.check_pitfalls(task_type="integration", planned_steps=planned)
        assert len(raised) == 2
        most_severe = raised[0]
        assert most_severe.warning_level == WarningLevel.HIGH
        assert most_severe.step_name == "Call API"
    async def test_no_matching_steps_returns_empty(self, detector, store):
        """A planned step unrelated to any recorded failing step yields no warnings."""
        linter_failure = _make_experience(
            task_type="code_review",
            outcome="failure",
            success_rate=0.0,
            steps_summary=[
                {"step_name": "Run Linter", "outcome": "failure", "error": "Config error"},
            ],
        )
        await store.record_experience(linter_failure)
        # The planned step shares no words with "Run Linter".
        planned = [_make_step(name="Deploy Application", description="Deploy to production")]
        raised = await detector.check_pitfalls(task_type="code_review", planned_steps=planned)
        assert raised == []
    async def test_different_task_type_no_cross_contamination(self, detector, store):
        """Failures recorded under one task_type must not raise warnings for another."""
        deployment_failure = _make_experience(
            task_type="deployment",
            outcome="failure",
            success_rate=0.0,
            steps_summary=[
                {"step_name": "Deploy Service", "outcome": "failure", "error": "OOM"},
            ],
        )
        await store.record_experience(deployment_failure)
        # Same step name, but queried as code_review: the deployment failure must not surface.
        planned = [_make_step(name="Deploy Service", description="Deploy the service")]
        found = await detector.check_pitfalls(task_type="code_review", planned_steps=planned)
        assert found == []
    async def test_partial_outcome_included(self, detector, store):
        """An experience whose outcome is "partial" still contributes a warning."""
        partial_migration = _make_experience(
            task_type="migration",
            outcome="partial",
            success_rate=0.5,
            steps_summary=[
                {"step_name": "Migrate Database", "outcome": "failure", "error": "Schema mismatch"},
            ],
        )
        await store.record_experience(partial_migration)
        planned = [_make_step(name="Migrate Database", description="Migrate DB schema")]
        found = await detector.check_pitfalls(task_type="migration", planned_steps=planned)
        assert len(found) == 1
    async def test_steps_summary_as_string_ignored(self, detector, store):
        """A string-valued steps_summary carries no step-level data, so no warning is produced."""
        string_summary_failure = _make_experience(
            task_type="code_review",
            outcome="failure",
            success_rate=0.0,
            steps_summary="Executed code_review task",  # plain string instead of a list of step dicts
        )
        await store.record_experience(string_summary_failure)
        planned = [_make_step(name="Review Code", description="Review the code")]
        found = await detector.check_pitfalls(task_type="code_review", planned_steps=planned)
        assert found == []
 # ── AE3 场景测试 ──────────────────────────────────────────
 class TestAE3Scenario:
    """AE3: "X system API times out 60% of the time at peak hours" -> a new task calling it is warned."""
    async def test_api_timeout_high_failure_rate_warning(self, detector, store):
        """A 60% historical failure rate on the X system API step produces a HIGH warning."""

        def timed_out_run():
            # A fresh experience per call so each recorded run is its own object.
            return _make_experience(
                task_type="order_processing",
                goal="Process orders via X system",
                outcome="failure",
                success_rate=0.0,
                steps_summary=[
                    {"step_name": "Call X System API", "outcome": "failure", "error": "高峰期超时"},
                    {"step_name": "Process Order", "outcome": "success"},
                ],
                failure_reasons=["X System API timeout during peak hours"],
                optimization_tips=["Avoid peak hours", "Add retry logic"],
            )

        def clean_run():
            return _make_experience(
                task_type="order_processing",
                goal="Process orders via X system",
                outcome="success",
                success_rate=1.0,
                steps_summary=[
                    {"step_name": "Call X System API", "outcome": "success"},
                    {"step_name": "Process Order", "outcome": "success"},
                ],
            )

        # Simulated history: 10 calls, 6 of which timed out -> 60% failure rate.
        for _ in range(6):
            await store.record_experience(timed_out_run())
        for _ in range(4):
            await store.record_experience(clean_run())
        # The new task plans to call the X system API.
        planned = [
            _make_step(name="Call X System API", description="Invoke X system API for orders"),
        ]
        found = await detector.check_pitfalls(task_type="order_processing", planned_steps=planned)
        assert len(found) == 1
        api_warning = found[0]
        assert api_warning.warning_level == WarningLevel.HIGH
        assert api_warning.failure_rate >= 0.5
        assert any("超时" in reason for reason in api_warning.historical_failures)
        assert api_warning.suggestion  # a non-empty suggestion is expected
 # ── PitfallWarning 数据模型测试 ───────────────────────────
 class TestPitfallWarning:
    """Field storage and defaults of the PitfallWarning data model."""
    def test_creation(self):
        """Every explicitly supplied field is readable back with the same value."""
        expected = {
            "step_name": "Call API",
            "warning_level": WarningLevel.HIGH,
            "failure_rate": 0.6,
            "historical_failures": ["Timeout", "Connection refused"],
            "suggestion": "Add retry logic",
        }
        warning = PitfallWarning(
            step_name="Call API",
            warning_level=WarningLevel.HIGH,
            failure_rate=0.6,
            historical_failures=["Timeout", "Connection refused"],
            suggestion="Add retry logic",
        )
        for attr, value in expected.items():
            assert getattr(warning, attr) == value
    def test_default_values(self):
        """Omitted optional fields default to an empty list and an empty string."""
        minimal = PitfallWarning(
            step_name="Test",
            warning_level=WarningLevel.LOW,
            failure_rate=0.1,
        )
        assert minimal.historical_failures == []
        assert minimal.suggestion == ""
 # ── WarningLevel 枚举测试 ─────────────────────────────────
 class TestWarningLevel:
    """String values of the WarningLevel enum."""
    def test_values(self):
        """Each level exposes its lowercase name as ``.value``."""
        for level, text in (
            (WarningLevel.HIGH, "high"),
            (WarningLevel.MEDIUM, "medium"),
            (WarningLevel.LOW, "low"),
        ):
            assert level.value == text
    def test_string_comparison(self):
        """Each level compares equal to its plain string."""
        for level, text in (
            (WarningLevel.HIGH, "high"),
            (WarningLevel.MEDIUM, "medium"),
            (WarningLevel.LOW, "low"),
        ):
            assert level == text
 # ── 相似度阈值配置测试 ─────────────────────────────────────
 class TestSimilarityThreshold:
    """Effect of the configurable similarity threshold."""
    async def test_custom_threshold(self, store):
        """A lower threshold never yields fewer warnings than a higher one."""
        # Low threshold: matches more easily.
        lenient = PitfallDetector(experience_store=store, similarity_threshold=0.1)
        # High threshold: matches less easily.
        strict = PitfallDetector(experience_store=store, similarity_threshold=0.8)
        failed_run = _make_experience(
            task_type="testing",
            outcome="failure",
            success_rate=0.0,
            steps_summary=[
                {"step_name": "Run Integration Tests", "outcome": "failure", "error": "Timeout"},
            ],
        )
        await store.record_experience(failed_run)
        planned = [_make_step(name="Run Unit Tests", description="Execute tests")]
        # The similar-but-different step name may match at the low threshold and not at the high one.
        lenient_hits = await lenient.check_pitfalls(task_type="testing", planned_steps=planned)
        strict_hits = await strict.check_pitfalls(task_type="testing", planned_steps=planned)
        # Only the ordering of the two counts is asserted, not their exact values.
        assert len(lenient_hits) >= len(strict_hits)
 # ── 端到端流程测试 ─────────────────────────────────────────
 class TestEndToEnd:
    """End-to-end pitfall detection scenarios."""
    async def test_full_pitfall_detection_flow(self, detector, store):
        """Record a mixed deployment history, then check the warning for the failing step."""
        # 1. Record several experiences: two failed builds and one fully successful run.
        history = [
            {
                "outcome": "failure",
                "success_rate": 0.0,
                "steps_summary": [
                    {"step_name": "Build Docker Image", "outcome": "failure", "error": "OOM"},
                    {"step_name": "Push to Registry", "outcome": "success"},
                ],
                "failure_reasons": ["Docker build OOM"],
                "optimization_tips": ["Increase memory limit"],
            },
            {
                "outcome": "failure",
                "success_rate": 0.0,
                "steps_summary": [
                    {"step_name": "Build Docker Image", "outcome": "failure", "error": "Dependency conflict"},
                    {"step_name": "Push to Registry", "outcome": "success"},
                ],
                "failure_reasons": ["Dependency conflict"],
            },
            {
                "outcome": "success",
                "success_rate": 1.0,
                "steps_summary": [
                    {"step_name": "Build Docker Image", "outcome": "success"},
                    {"step_name": "Push to Registry", "outcome": "success"},
                ],
            },
        ]
        for spec in history:
            await store.record_experience(_make_experience(task_type="deployment", **spec))
        # 2. Plan for the new task.
        planned = [
            _make_step(name="Build Docker Image", description="Build the container image", step_id="s1"),
            _make_step(name="Push to Registry", description="Push image to container registry", step_id="s2"),
        ]
        # 3. Run pitfall detection.
        found = await detector.check_pitfalls(task_type="deployment", planned_steps=planned)
        # 4. Verify the result.
        assert len(found) >= 1
        # Build Docker Image failed in 2 of the 3 recorded runs (~0.667), so HIGH is expected.
        build_matches = [w for w in found if w.step_name == "Build Docker Image"]
        assert build_matches
        build_warning = build_matches[0]
        assert build_warning.warning_level == WarningLevel.HIGH
        assert build_warning.failure_rate == pytest.approx(2.0 / 3.0, abs=0.01)
    async def test_suggestion_contains_useful_info(self, detector, store):
        """The warning's suggestion text mentions the recorded failure reason."""
        expired_token_run = _make_experience(
            task_type="api_integration",
            outcome="failure",
            success_rate=0.0,
            steps_summary=[
                {"step_name": "Authenticate", "outcome": "failure", "error": "Token expired"},
            ],
            failure_reasons=["Token expired"],
            optimization_tips=["Refresh token before expiry"],
        )
        await store.record_experience(expired_token_run)
        planned = [_make_step(name="Authenticate", description="Authenticate with API")]
        found = await detector.check_pitfalls(task_type="api_integration", planned_steps=planned)
        assert len(found) == 1
        assert "Token expired" in found[0].suggestion
--- a/tests/unit/tools/test_pty_session.py
+++ b/tests/unit/tools/test_pty_session.py
@ -0,0 +1,217 @@
 """PTYSession 单元测试
 测试场景：
 - PTY 会话启动和关闭
 - 交互式命令执行
 - 自动应答提示
 - 超时处理
 - 自定义应答规则
 """
 from __future__ import annotations
 import asyncio
 from unittest.mock import AsyncMock, MagicMock, patch
 import pytest
 from agentkit.tools.pty_session import PTYSession, PTYOutput
 from agentkit.tools.shell import ShellTool
 class TestPTYSessionConstruction:
    """Construction-time configuration of PTYSession."""
    def test_default_construction(self):
        """A fresh session is not running, auto-responds, and uses a 30 s timeout."""
        pty = PTYSession()
        assert pty.is_running is False
        assert pty._auto_respond is True
        assert pty._default_timeout == 30.0
    def test_custom_construction(self):
        """Custom constructor arguments are stored, including the extra respond rule."""
        pty = PTYSession(
            auto_respond=False,
            custom_rules=[(r"confirm\?", "yes")],
            default_timeout=60.0,
        )
        assert pty._auto_respond is False
        # Verify the custom rule itself is registered. Comparing the rule count
        # against itself minus one would pass no matter what the constructor did.
        rule_patterns = [rule[0] for rule in pty._respond_rules]
        assert r"confirm\?" in rule_patterns
        assert pty._default_timeout == 60.0
 class TestPTYSessionLifecycle:
    """Start/close lifecycle of PTYSession."""
    @pytest.mark.asyncio
    async def test_start_and_close(self):
        """start() marks the session running and close() marks it stopped."""
        pty = PTYSession()
        try:
            await pty.start()
            assert pty.is_running is True
        finally:
            # Always close so a failed assertion does not leave the PTY open.
            await pty.close()
        assert pty.is_running is False
    @pytest.mark.asyncio
    async def test_start_idempotent(self):
        """Calling start() twice does not raise and leaves the session running."""
        pty = PTYSession()
        try:
            await pty.start()
            await pty.start()  # must not raise
            assert pty.is_running is True
        finally:
            # Always close so a failed assertion does not leave the PTY open.
            await pty.close()
    @pytest.mark.asyncio
    async def test_close_without_start(self):
        """close() on a session that was never started does not raise."""
        pty = PTYSession()
        await pty.close()  # must not raise
 class TestPTYSessionExecution:
    """Command execution through a started PTYSession."""
    @pytest.mark.asyncio
    async def test_run_simple_command(self):
        """A simple echo returns its text, exit code 0, and no timeout flag."""
        shell = PTYSession(default_timeout=10.0)
        try:
            await shell.start()
            outcome = await shell.run_command("echo hello_pty")
            assert "hello_pty" in outcome.output
            assert outcome.exit_code == 0
            assert outcome.timed_out is False
        finally:
            await shell.close()
    @pytest.mark.asyncio
    async def test_run_command_with_cwd(self):
        """The cwd argument decides where the command runs."""
        shell = PTYSession(default_timeout=10.0)
        try:
            await shell.start()
            outcome = await shell.run_command("pwd", cwd="/tmp")
            assert "/tmp" in outcome.output
        finally:
            await shell.close()
    @pytest.mark.asyncio
    async def test_run_command_with_env(self):
        """Variables passed through env are visible to the command."""
        shell = PTYSession(default_timeout=10.0)
        try:
            await shell.start()
            extra_env = {"PTY_TEST_VAR": "pty_value"}
            outcome = await shell.run_command("echo $PTY_TEST_VAR", env=extra_env)
            assert "pty_value" in outcome.output
        finally:
            await shell.close()
    @pytest.mark.asyncio
    async def test_run_failing_command(self):
        """A failing command reports a non-zero exit code."""
        shell = PTYSession(default_timeout=10.0)
        try:
            await shell.start()
            outcome = await shell.run_command("ls /nonexistent_dir_xyz_12345")
            assert outcome.exit_code != 0
        finally:
            await shell.close()
    @pytest.mark.asyncio
    async def test_run_command_timeout(self):
        """A command exceeding its timeout is flagged timed_out with exit code -1."""
        shell = PTYSession(default_timeout=10.0)
        try:
            await shell.start()
            outcome = await shell.run_command("sleep 30", timeout=0.5)
            assert outcome.timed_out is True
            assert outcome.exit_code == -1
        finally:
            await shell.close()
 class TestPTYSessionAutoRespond:
    """Auto-respond configuration of PTYSession."""
    @pytest.mark.asyncio
    async def test_auto_respond_yes_no(self):
        """The loaded rule set contains a pattern for [y/N]-style prompts."""
        # No command is run here; only the loaded rules are inspected.
        session = PTYSession(auto_respond=True)
        patterns = [rule[0] for rule in session._respond_rules]
        assert any(marker in pattern for pattern in patterns for marker in ("y/N", "Y/n"))
    @pytest.mark.asyncio
    async def test_auto_respond_disabled(self):
        """auto_respond=False is stored on the session."""
        session = PTYSession(auto_respond=False)
        assert session._auto_respond is False
    @pytest.mark.asyncio
    async def test_custom_respond_rules(self):
        """A custom rule's pattern shows up among the session's respond rules."""
        custom_pattern = r"continue\?\s*$"
        session = PTYSession(
            auto_respond=True,
            custom_rules=[(custom_pattern, "yes")],
        )
        patterns = [rule[0] for rule in session._respond_rules]
        assert custom_pattern in patterns
 class TestPTYSessionSendAndRead:
    """send() and read_output() on a session that was never started."""
    @pytest.mark.asyncio
    async def test_send_without_start(self):
        """send() before start() does not raise."""
        idle = PTYSession()
        await idle.send("test")  # must not raise
    @pytest.mark.asyncio
    async def test_read_output_without_start(self):
        """read_output() before start() returns an empty string."""
        idle = PTYSession()
        text = await idle.read_output()
        assert text == ""
 class TestPTYOutput:
    """Defaults and explicit values of the PTYOutput data class."""
    def test_default_values(self):
        """Only output is given: exit_code defaults to -1 and timed_out to False."""
        record = PTYOutput(output="test")
        assert record.output == "test"
        assert record.exit_code == -1
        assert record.timed_out is False
    def test_custom_values(self):
        """Explicit exit_code and timed_out are stored unchanged."""
        record = PTYOutput(output="error", exit_code=1, timed_out=True)
        assert record.exit_code == 1
        assert record.timed_out is True
 class TestShellToolInteractiveMode:
    """ShellTool executed with interactive=True."""
    @pytest.mark.asyncio
    async def test_interactive_mode(self):
        """An interactive echo succeeds and its text is in the output."""
        shell = ShellTool()
        response = await shell.execute(command="echo interactive_test", interactive=True)
        assert response["exit_code"] == 0
        assert "interactive_test" in response["output"]
    @pytest.mark.asyncio
    async def test_interactive_mode_with_session(self):
        """Interactive mode combined with a session_id also succeeds."""
        shell = ShellTool()
        call_args = {
            "command": "echo session_interactive",
            "session_id": "int-session",
            "interactive": True,
        }
        response = await shell.execute(**call_args)
        assert response["exit_code"] == 0
        assert "session_interactive" in response["output"]
--- a/tests/unit/tools/test_terminal_session.py
+++ b/tests/unit/tools/test_terminal_session.py
@ -0,0 +1,600 @@
 """TerminalSession 和 ShellTool 单元测试
 测试场景：
 - 跨命令保持 cwd → cd 后执行 pwd 返回正确目录
 - 跨命令保持 env → export 后执行 echo 返回正确值
 - 危险命令需确认 → rm 命令触发确认回调
 - 输出解析 → 错误输出结构化为错误类型+建议
 - 无 session_id 时保持现有行为
 - 会话管理器功能
 """
 from __future__ import annotations
 import os
 from unittest.mock import AsyncMock, MagicMock, patch
 import pytest
 from agentkit.tools.terminal_session import TerminalSession, TerminalSessionManager, CommandRecord
 from agentkit.tools.shell import ShellTool
 from agentkit.tools.output_parser import OutputParser, ParsedOutput, ErrorType
 # ============================================================
 # OutputParser 测试
 # ============================================================
 class TestOutputParser:
    """Structured parsing of command output by OutputParser."""
    def setup_method(self):
        self.parser = OutputParser()
    def _expect_error(self, text, exit_code, error_type):
        """Parse *text* and assert it is classified as an error of *error_type*."""
        parsed = self.parser.parse(text, exit_code)
        assert parsed.is_error is True
        assert parsed.error_type == error_type
        return parsed
    def test_parse_success_output(self):
        """Successful output: no error, type NONE, message kept, no suggestions."""
        parsed = self.parser.parse("hello world", 0)
        assert parsed.exit_code == 0
        assert parsed.is_error is False
        assert parsed.error_type == ErrorType.NONE
        assert parsed.message == "hello world"
        assert parsed.suggestions == []
    def test_parse_empty_output(self):
        """Empty output with exit code 0 is not an error and has an empty message."""
        parsed = self.parser.parse("", 0)
        assert parsed.exit_code == 0
        assert parsed.is_error is False
        assert parsed.message == ""
    def test_parse_permission_denied(self):
        """Permission errors are classified and come with a sudo suggestion."""
        parsed = self._expect_error(
            "permission denied: /root/secret", 1, ErrorType.PERMISSION_DENIED
        )
        assert len(parsed.suggestions) > 0
        assert any("sudo" in hint for hint in parsed.suggestions)
    def test_parse_not_found(self):
        """A missing file or directory is classified as NOT_FOUND."""
        self._expect_error("No such file or directory: /tmp/missing", 1, ErrorType.NOT_FOUND)
    def test_parse_timeout(self):
        """A timed-out connection is classified as TIMEOUT."""
        self._expect_error("Connection timed out", 1, ErrorType.TIMEOUT)
    def test_parse_syntax_error(self):
        """A shell syntax error is classified as SYNTAX_ERROR."""
        self._expect_error("syntax error near unexpected token", 2, ErrorType.SYNTAX_ERROR)
    def test_parse_connection_refused(self):
        """A refused connection is classified as CONNECTION_REFUSED."""
        self._expect_error("Connection refused on port 8080", 1, ErrorType.CONNECTION_REFUSED)
    def test_parse_out_of_memory(self):
        """An out-of-memory message is classified as OUT_OF_MEMORY."""
        self._expect_error("Out of memory: cannot allocate", 1, ErrorType.OUT_OF_MEMORY)
    def test_parse_disk_full(self):
        """A full disk is classified as DISK_FULL."""
        self._expect_error("No space left on device", 1, ErrorType.DISK_FULL)
    def test_parse_already_exists(self):
        """An "already exists" message is classified as ALREADY_EXISTS."""
        self._expect_error("File already exists: /tmp/test", 1, ErrorType.ALREADY_EXISTS)
    def test_parse_invalid_argument(self):
        """An invalid argument is classified as INVALID_ARGUMENT."""
        self._expect_error("invalid argument: --unknown-flag", 1, ErrorType.INVALID_ARGUMENT)
    def test_parse_network_error(self):
        """An unreachable network is classified as NETWORK_ERROR."""
        self._expect_error("Network is unreachable", 1, ErrorType.NETWORK_ERROR)
    def test_parse_exit_code_126(self):
        """Exit code 126 with unrecognized text maps to PERMISSION_DENIED."""
        self._expect_error("some unknown error", 126, ErrorType.PERMISSION_DENIED)
    def test_parse_exit_code_127(self):
        """Exit code 127 with unrecognized text maps to NOT_FOUND."""
        self._expect_error("some unknown error", 127, ErrorType.NOT_FOUND)
    def test_parse_exit_code_130(self):
        """Exit code 130 (interrupted) with unrecognized text maps to TIMEOUT."""
        self._expect_error("some unknown error", 130, ErrorType.TIMEOUT)
    def test_parse_unknown_error(self):
        """Unrecognized text with a generic failure code is classified as UNKNOWN."""
        self._expect_error("something went wrong", 1, ErrorType.UNKNOWN)
    def test_parse_long_message_truncated(self):
        """A 300-character output is shortened in the message."""
        oversized = "x" * 300
        parsed = self.parser.parse(oversized, 0)
        # Presumably 200 characters plus a "..." suffix; only the upper bound is checked.
        assert len(parsed.message) <= 203
    def test_parsed_output_to_dict(self):
        """to_dict() exposes exit code, error flag, error type string and suggestions list."""
        as_dict = self.parser.parse("permission denied", 1).to_dict()
        assert as_dict["exit_code"] == 1
        assert as_dict["is_error"] is True
        assert as_dict["error_type"] == "permission_denied"
        assert isinstance(as_dict["suggestions"], list)
    def test_parse_chinese_error_messages(self):
        """A Chinese permission error is classified as PERMISSION_DENIED."""
        self._expect_error("权限不足: 无法访问", 1, ErrorType.PERMISSION_DENIED)
    def test_parse_multiline_output_message_is_last_line(self):
        """For multi-line output the message is the last line."""
        lines = "line1\nline2\nline3"
        parsed = self.parser.parse(lines, 0)
        assert parsed.message == "line3"
 # ============================================================
 # TerminalSession 测试
 # ============================================================
 class TestTerminalSession:
    """Session state handling of TerminalSession."""
    def test_construction_default(self):
        """Defaults: given id, process cwd, a dict env, and empty history."""
        term = TerminalSession(session_id="test")
        assert term.session_id == "test"
        assert term.cwd == os.getcwd()
        assert isinstance(term.env, dict)
        assert term.history == []
    def test_construction_custom_cwd(self):
        """An explicit cwd is stored."""
        term = TerminalSession(session_id="test", cwd="/tmp")
        assert term.cwd == "/tmp"
    def test_construction_custom_env(self):
        """Explicit env entries are readable from the session env."""
        term = TerminalSession(session_id="test", env={"FOO": "bar"})
        assert term.env.get("FOO") == "bar"
    def test_set_cwd(self):
        """set_cwd() replaces the working directory."""
        term = TerminalSession(session_id="test")
        term.set_cwd("/usr/local")
        assert term.cwd == "/usr/local"
    def test_set_env(self):
        """set_env() stores a single variable."""
        term = TerminalSession(session_id="test")
        term.set_env("MY_VAR", "hello")
        assert term.env.get("MY_VAR") == "hello"
    def test_update_env(self):
        """update_env() stores several variables at once."""
        term = TerminalSession(session_id="test")
        term.update_env({"A": "1", "B": "2"})
        for key, value in (("A", "1"), ("B", "2")):
            assert term.env.get(key) == value
    def test_get_env_returns_copy(self):
        """Mutating the dict from get_env() does not affect the session env."""
        term = TerminalSession(session_id="test")
        snapshot = term.get_env()
        snapshot["HACKED"] = "yes"
        assert "HACKED" not in term.env
    def test_get_history_returns_copy(self):
        """get_history() returns a different object than the internal list."""
        term = TerminalSession(session_id="test")
        snapshot = term.get_history()
        assert snapshot is not term._history
    @pytest.mark.asyncio
    async def test_execute_simple_command(self):
        """A simple echo exits with 0 and its text is in raw_output."""
        term = TerminalSession(session_id="test")
        outcome = await term.execute("echo hello")
        assert outcome.exit_code == 0
        assert "hello" in outcome.raw_output
    @pytest.mark.asyncio
    async def test_execute_records_history(self):
        """Executed commands are appended to history in order."""
        term = TerminalSession(session_id="test")
        await term.execute("echo first")
        await term.execute("echo second")
        assert len(term.history) == 2
        first_record, second_record = term.history[0], term.history[1]
        assert first_record.command == "echo first"
        assert second_record.command == "echo second"
    @pytest.mark.asyncio
    async def test_cross_command_cwd(self):
        """cwd persists across commands: pwd after cd reports the new directory."""
        term = TerminalSession(session_id="test")
        await term.execute("cd /tmp")
        assert term.cwd == "/tmp"
        outcome = await term.execute("pwd")
        assert "/tmp" in outcome.raw_output
    @pytest.mark.asyncio
    async def test_cross_command_env(self):
        """env persists across commands: echo after export prints the value."""
        term = TerminalSession(session_id="test")
        await term.execute("export MY_TEST_VAR=hello123")
        assert term.env.get("MY_TEST_VAR") == "hello123"
        outcome = await term.execute("echo $MY_TEST_VAR")
        assert "hello123" in outcome.raw_output
    @pytest.mark.asyncio
    async def test_cd_relative_path(self):
        """cd with a relative path updates cwd relative to the current one."""
        # Starts in /usr and changes into "local"; this relies on /usr/local existing.
        term = TerminalSession(session_id="test", cwd="/usr")
        await term.execute("cd local")
        assert term.cwd == "/usr/local"
    @pytest.mark.asyncio
    async def test_cd_absolute_path(self):
        """cd with an absolute path sets cwd to that path."""
        term = TerminalSession(session_id="test")
        await term.execute("cd /usr")
        assert term.cwd == "/usr"
    @pytest.mark.asyncio
    async def test_failed_command_no_state_update(self):
        """A failed cd leaves cwd untouched."""
        term = TerminalSession(session_id="test", cwd="/tmp")
        await term.execute("cd /nonexistent_dir_xyz")
        # The cd failed, so cwd must still be the starting directory.
        assert term.cwd == "/tmp"
    @pytest.mark.asyncio
    async def test_timeout(self):
        """A command exceeding its timeout yields exit code -1 and is an error."""
        term = TerminalSession(session_id="test")
        outcome = await term.execute("sleep 10", timeout=0.5)
        assert outcome.exit_code == -1
        assert outcome.is_error is True
    @pytest.mark.asyncio
    async def test_max_history(self):
        """History is capped at max_history, keeping the most recent commands."""
        term = TerminalSession(session_id="test", max_history=3)
        for command in [f"echo {i}" for i in range(5)]:
            await term.execute(command)
        assert len(term.history) == 3
        assert term.history[0].command == "echo 2"
    def test_close(self):
        """close() does not raise."""
        term = TerminalSession(session_id="test")
        term.close()  # must not raise
 # ============================================================
 # TerminalSessionManager 测试
 # ============================================================
 class TestTerminalSessionManager:
    """Session bookkeeping of TerminalSessionManager."""
    def test_get_or_create_new(self):
        """get_or_create() with an unknown id returns a session carrying that id."""
        registry = TerminalSessionManager()
        created = registry.get_or_create("s1")
        assert created.session_id == "s1"
    def test_get_or_create_existing(self):
        """get_or_create() with a known id returns the session with its earlier state."""
        registry = TerminalSessionManager()
        first = registry.get_or_create("s1")
        first.set_cwd("/tmp")
        second = registry.get_or_create("s1")
        assert second.cwd == "/tmp"
    def test_get_existing(self):
        """get() returns a session that was created before."""
        registry = TerminalSessionManager()
        registry.get_or_create("s1")
        fetched = registry.get("s1")
        assert fetched is not None
    def test_get_nonexistent(self):
        """get() returns None for an unknown id."""
        registry = TerminalSessionManager()
        assert registry.get("nonexistent") is None
    def test_remove(self):
        """remove() makes the session unavailable through get()."""
        registry = TerminalSessionManager()
        registry.get_or_create("s1")
        registry.remove("s1")
        assert registry.get("s1") is None
    def test_list_sessions(self):
        """list_sessions() reports the ids of all created sessions."""
        registry = TerminalSessionManager()
        for session_id in ("s1", "s2"):
            registry.get_or_create(session_id)
        assert sorted(registry.list_sessions()) == ["s1", "s2"]
    def test_has_session(self):
        """has_session() is True for created ids and False otherwise."""
        registry = TerminalSessionManager()
        registry.get_or_create("s1")
        assert registry.has_session("s1") is True
        assert registry.has_session("s2") is False
    def test_max_sessions_eviction(self):
        """Exceeding max_sessions evicts the oldest session."""
        registry = TerminalSessionManager(max_sessions=2)
        registry.get_or_create("s1")
        registry.get_or_create("s2")
        registry.get_or_create("s3")  # expected to evict s1
        assert not registry.has_session("s1")
        assert registry.has_session("s2")
        assert registry.has_session("s3")
    def test_close_all(self):
        """close_all() leaves no sessions listed."""
        registry = TerminalSessionManager()
        for session_id in ("s1", "s2"):
            registry.get_or_create(session_id)
        registry.close_all()
        assert registry.list_sessions() == []
 # ============================================================
 # ShellTool 测试
 # ============================================================
 class TestShellToolConstruction:
    """Construction and metadata of ShellTool."""
    def test_default_construction(self):
        """Default tool is named "shell" and its schema requires only "command"."""
        shell = ShellTool()
        assert shell.name == "shell"
        assert shell.input_schema is not None
        for prop in ("command", "session_id"):
            assert prop in shell.input_schema["properties"]
        assert shell.input_schema["required"] == ["command"]
    def test_custom_construction(self):
        """Custom name and version are stored."""
        shell = ShellTool(name="my_shell", version="2.0.0")
        assert shell.name == "my_shell"
        assert shell.version == "2.0.0"
    def test_to_dict(self):
        """to_dict() includes the name and an input_schema entry."""
        as_dict = ShellTool().to_dict()
        assert as_dict["name"] == "shell"
        assert "input_schema" in as_dict
    def test_repr(self):
        """repr() mentions both the class name and the tool name."""
        text = repr(ShellTool())
        assert "ShellTool" in text
        assert "shell" in text
 class TestShellToolExecution:
    """Command execution through ShellTool, with and without sessions."""
    @pytest.mark.asyncio
    async def test_execute_simple_command(self):
        """A simple command without a session succeeds and reports session_id None."""
        shell = ShellTool()
        response = await shell.execute(command="echo hello")
        assert response["exit_code"] == 0
        assert "hello" in response["output"]
        assert response["is_error"] is False
        assert response["session_id"] is None
    @pytest.mark.asyncio
    async def test_execute_missing_command(self):
        """Calling execute() without a command yields an error with exit code 1."""
        shell = ShellTool()
        response = await shell.execute()
        assert response["is_error"] is True
        assert response["exit_code"] == 1
    @pytest.mark.asyncio
    async def test_execute_with_working_dir(self):
        """working_dir decides where the command runs."""
        shell = ShellTool()
        response = await shell.execute(command="pwd", working_dir="/tmp")
        assert response["exit_code"] == 0
        assert "/tmp" in response["output"]
    @pytest.mark.asyncio
    async def test_execute_with_session(self):
        """A command run with a session_id succeeds and echoes that id back."""
        shell = ShellTool()
        response = await shell.execute(command="echo session_test", session_id="s1")
        assert response["exit_code"] == 0
        assert "session_test" in response["output"]
        assert response["session_id"] == "s1"
    @pytest.mark.asyncio
    async def test_session_preserves_cwd(self):
        """Within one session, pwd after cd reports the new directory."""
        shell = ShellTool()
        session = "cwd-test"
        await shell.execute(command="cd /tmp", session_id=session)
        response = await shell.execute(command="pwd", session_id=session)
        assert "/tmp" in response["output"]
    @pytest.mark.asyncio
    async def test_session_preserves_env(self):
        """Within one session, echo after export prints the exported value."""
        shell = ShellTool()
        session = "env-test"
        await shell.execute(command="export SHELL_TEST_VAR=world", session_id=session)
        response = await shell.execute(command="echo $SHELL_TEST_VAR", session_id=session)
        assert "world" in response["output"]
    @pytest.mark.asyncio
    async def test_no_session_id_backward_compatible(self):
        """Without session_id the call behaves as before and reports session_id None."""
        shell = ShellTool()
        response = await shell.execute(command="echo no_session")
        assert response["exit_code"] == 0
        assert "no_session" in response["output"]
        assert response["session_id"] is None
    @pytest.mark.asyncio
    async def test_different_sessions_independent(self):
        """Two sessions keep separate working directories."""
        shell = ShellTool()
        await shell.execute(command="cd /tmp", session_id="s1")
        await shell.execute(command="cd /usr", session_id="s2")
        first = await shell.execute(command="pwd", session_id="s1")
        second = await shell.execute(command="pwd", session_id="s2")
        assert "/tmp" in first["output"]
        assert "/usr" in second["output"]
 class TestShellToolSecurity:
    """Tests for ShellTool security controls."""

    @pytest.mark.asyncio
    async def test_safe_command_allowed(self):
        """A safe command is executed directly."""
        outcome = await ShellTool().execute(command="ls /tmp")
        assert outcome["exit_code"] == 0

    @pytest.mark.asyncio
    async def test_dangerous_command_blocked_without_callback(self):
        """A dangerous command is refused when no confirm callback is set."""
        outcome = await ShellTool().execute(command="rm -rf /tmp/test")
        assert outcome["is_error"] is True
        assert outcome["exit_code"] == 126

    @pytest.mark.asyncio
    async def test_dangerous_command_confirmed(self):
        """A dangerous command is allowed once the confirm callback approves it."""
        approve = AsyncMock(return_value=True)
        shell = ShellTool(confirm_callback=approve)
        outcome = await shell.execute(command="rm -rf /tmp/nonexistent_test_dir")
        assert approve.called
        # The command itself may fail (the directory does not exist), but it
        # must not have been rejected by the security mechanism.
        rejected = outcome["exit_code"] == 126 and outcome["is_error"]
        assert not rejected

    @pytest.mark.asyncio
    async def test_dangerous_command_rejected_by_callback(self):
        """A dangerous command is refused when the confirm callback declines."""
        decline = AsyncMock(return_value=False)
        shell = ShellTool(confirm_callback=decline)
        outcome = await shell.execute(command="rm -rf /tmp/test")
        assert outcome["is_error"] is True
        assert outcome["exit_code"] == 126

    @pytest.mark.asyncio
    async def test_audit_log_recorded(self):
        """An executed command is recorded in the audit log."""
        shell = ShellTool()
        await shell.execute(command="echo audit_test")
        assert len(shell.audit_log) > 0
        first_entry = shell.audit_log[0]
        assert first_entry["command"] == "echo audit_test"

    @pytest.mark.asyncio
    async def test_blocked_command_in_audit_log(self):
        """A blocked command is recorded in the audit log."""
        shell = ShellTool()
        await shell.execute(command="rm -rf /tmp/test")
        assert any(entry.get("blocked") for entry in shell.audit_log)

    @pytest.mark.asyncio
    async def test_git_push_force_is_dangerous(self):
        """`git push --force` is treated as a dangerous command."""
        outcome = await ShellTool().execute(command="git push --force origin main")
        assert outcome["is_error"] is True
        assert outcome["exit_code"] == 126

    @pytest.mark.asyncio
    async def test_git_status_is_safe(self):
        """`git status` is treated as a safe command."""
        outcome = await ShellTool().execute(command="git status")
        # git status may fail outside a git repository, but it must not be
        # rejected by the security mechanism.
        assert outcome["exit_code"] != 126
 class TestShellToolOutputParsing:
    """Tests for the output-parsing integration of ShellTool."""

    @pytest.mark.asyncio
    async def test_error_output_structured(self):
        """A failing command yields a structured error result."""
        outcome = await ShellTool().execute(command="ls /nonexistent_dir_xyz_12345")
        assert outcome["is_error"] is True
        assert outcome["error_type"] in ("not_found", "unknown")
        assert isinstance(outcome["suggestions"], list)

    @pytest.mark.asyncio
    async def test_success_output_not_error(self):
        """A successful command is not flagged as an error."""
        outcome = await ShellTool().execute(command="echo success")
        assert outcome["is_error"] is False
        assert outcome["error_type"] == "none"
 class TestShellToolSessionManager:
    """Tests for access to the ShellTool session manager."""

    def test_session_manager_accessible(self):
        """A freshly constructed tool exposes a non-None session manager."""
        assert ShellTool().session_manager is not None

    @pytest.mark.asyncio
    async def test_session_created_on_first_use(self):
        """A session is created the first time its session_id is used."""
        shell = ShellTool()
        existed_before = shell.session_manager.has_session("new-session")
        assert not existed_before
        await shell.execute(command="echo test", session_id="new-session")
        assert shell.session_manager.has_session("new-session")