975 lines
34 KiB
Python
975 lines
34 KiB
Python
"""Tests for PlanChecker — 计划检查与复盘"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
from datetime import datetime, timezone
|
|
from typing import Any
|
|
|
|
from agentkit.core.plan_checker import (
|
|
CheckResult,
|
|
CheckStatus,
|
|
PlanChecker,
|
|
QualityGate,
|
|
ReviewReport,
|
|
RuleBasedStepReflector,
|
|
)
|
|
from agentkit.core.plan_executor import PlanExecutionResult, StepExecutionResult
|
|
from agentkit.core.plan_schema import ExecutionPlan, PlanStep, PlanStepStatus
|
|
from agentkit.skills.base import QualityGateConfig
|
|
from agentkit.evolution.experience_store import InMemoryExperienceStore
|
|
from agentkit.evolution.experience_schema import TaskExperience
|
|
|
|
|
|
# --- Helpers ---
|
|
|
|
|
|
def make_step(
    step_id: str = "s0",
    name: str = "Test Step",
    description: str = "A test step",
    **kwargs: Any,
) -> PlanStep:
    """Build a ``PlanStep`` for use in tests.

    Args:
        step_id: Identifier handed to ``PlanStep``.
        name: Step name handed to ``PlanStep``.
        description: Step description handed to ``PlanStep``.
        **kwargs: Extra keyword arguments forwarded to ``PlanStep`` unchanged.

    Returns:
        The constructed ``PlanStep``.
    """
    plan_step = PlanStep(
        step_id=step_id,
        name=name,
        description=description,
        **kwargs,
    )
    return plan_step
|
|
|
|
|
|
def make_step_result(
    step_id: str = "s0",
    status: PlanStepStatus = PlanStepStatus.COMPLETED,
    result: dict[str, Any] | None = None,
    error: str | None = None,
    retry_count: int = 0,
    duration_ms: float = 100.0,
) -> StepExecutionResult:
    """Build a ``StepExecutionResult`` for use in tests.

    Every argument is passed straight through to ``StepExecutionResult``
    under the same keyword; the defaults describe a completed step with no
    payload, no error, no retries and a 100 ms duration.
    """
    fields: dict[str, Any] = {
        "step_id": step_id,
        "status": status,
        "result": result,
        "error": error,
        "retry_count": retry_count,
        "duration_ms": duration_ms,
    }
    return StepExecutionResult(**fields)
|
|
|
|
|
|
def make_plan_result(
    plan_id: str = "p1",
    step_results: dict[str, StepExecutionResult] | None = None,
    total_duration_ms: float = 500.0,
) -> PlanExecutionResult:
    """Build a ``PlanExecutionResult`` for use in tests.

    Args:
        plan_id: Identifier of the plan the result belongs to.
        step_results: Per-step results keyed by step id. When ``None``, a
            single default result under ``"s0"`` is used; an explicitly
            passed empty dict is kept as-is.
        total_duration_ms: Total duration recorded on the result.

    Returns:
        A ``PlanExecutionResult`` whose overall status is always
        ``TaskStatus.COMPLETED``, whatever the individual step statuses are.
    """
    # Imported inside the helper, exactly as the module has always done.
    from agentkit.core.protocol import TaskStatus

    per_step = {"s0": make_step_result()} if step_results is None else step_results
    return PlanExecutionResult(
        plan_id=plan_id,
        step_results=per_step,
        status=TaskStatus.COMPLETED,
        total_duration_ms=total_duration_ms,
    )
|
|
|
|
|
|
def make_plan(
    steps: list[PlanStep] | None = None,
    plan_id: str = "p1",
    goal: str = "test goal",
) -> ExecutionPlan:
    """Build a confirmed ``ExecutionPlan`` for use in tests.

    Args:
        steps: Steps of the plan. When ``None``, one default step from
            ``make_step()`` is used; an explicitly passed empty list is kept.
        plan_id: Identifier of the plan.
        goal: Goal text of the plan.

    Returns:
        An ``ExecutionPlan`` with ``confirmed=True`` whose single parallel
        group lists the ids of all steps in order.
    """
    plan_steps = [make_step()] if steps is None else steps
    single_group = [item.step_id for item in plan_steps]
    return ExecutionPlan(
        plan_id=plan_id,
        goal=goal,
        steps=plan_steps,
        parallel_groups=[single_group],
        confirmed=True,
    )
|
|
|
|
|
|
# --- QualityGate Tests ---
|
|
|
|
|
|
class TestQualityGate:
    """Rule checks performed by ``QualityGate.check``."""

    def test_pass_when_no_config(self):
        """Without a config, a completed step's result passes."""
        gate = QualityGate()
        step = make_step()
        result = make_step_result(result={"data": "test"})
        check = gate.check(step, result)
        assert check.status == CheckStatus.PASS

    def test_pass_with_required_fields_present(self):
        """The check passes when every required field is present."""
        config = QualityGateConfig(required_fields=["name", "value"])
        gate = QualityGate(config=config)
        step = make_step()
        result = make_step_result(result={"name": "test", "value": 42})
        check = gate.check(step, result)
        assert check.status == CheckStatus.PASS

    def test_fail_with_missing_required_fields(self):
        """The check fails when a required field is absent."""
        config = QualityGateConfig(required_fields=["name", "value", "missing"])
        gate = QualityGate(config=config)
        step = make_step()
        result = make_step_result(result={"name": "test", "value": 42})
        check = gate.check(step, result)
        assert check.status == CheckStatus.FAIL
        # The case-insensitive check already covers a reason that starts with
        # "Missing required fields", so no second alternative is needed.
        assert "missing" in check.reason.lower()

    def test_fail_with_none_result_and_required_fields(self):
        """A ``None`` result fails when required fields are configured."""
        config = QualityGateConfig(required_fields=["name"])
        gate = QualityGate(config=config)
        step = make_step()
        result = make_step_result(result=None)
        check = gate.check(step, result)
        assert check.status == CheckStatus.FAIL

    def test_pass_with_min_word_count_met(self):
        """The check passes when the minimum word count is reached."""
        config = QualityGateConfig(min_word_count=3)
        gate = QualityGate(config=config)
        step = make_step()
        result = make_step_result(result={"text": "hello world foo"})
        check = gate.check(step, result)
        assert check.status == CheckStatus.PASS

    def test_fail_with_min_word_count_not_met(self):
        """The check fails when the minimum word count is not reached."""
        config = QualityGateConfig(min_word_count=100)
        gate = QualityGate(config=config)
        step = make_step()
        result = make_step_result(result={"text": "hello"})
        check = gate.check(step, result)
        assert check.status == CheckStatus.FAIL
        # Case-insensitive match; also covers the capitalised "Word count".
        assert "word count" in check.reason.lower()

    def test_skip_for_non_completed_step(self):
        """A failed step is skipped instead of being checked."""
        gate = QualityGate()
        step = make_step()
        result = make_step_result(status=PlanStepStatus.FAILED, error="some error")
        check = gate.check(step, result)
        assert check.status == CheckStatus.SKIP

    def test_skip_for_skipped_step(self):
        """A skipped step is skipped instead of being checked."""
        gate = QualityGate()
        step = make_step()
        result = make_step_result(status=PlanStepStatus.SKIPPED, error="skipped")
        check = gate.check(step, result)
        assert check.status == CheckStatus.SKIP

    def test_custom_validator_pass(self):
        """A custom validator that accepts the result yields PASS."""

        def validator(result):
            return (True, "")

        gate = QualityGate(custom_validator=validator)
        step = make_step()
        result = make_step_result(result={"data": "test"})
        check = gate.check(step, result)
        assert check.status == CheckStatus.PASS

    def test_custom_validator_fail(self):
        """A custom validator that rejects the result yields FAIL with its reason."""

        def validator(result):
            return (False, "Output format incorrect")

        gate = QualityGate(custom_validator=validator)
        step = make_step()
        result = make_step_result(result={"data": "test"})
        check = gate.check(step, result)
        assert check.status == CheckStatus.FAIL
        assert "Output format incorrect" in check.reason

    def test_custom_validator_exception(self):
        """A custom validator that raises yields FAIL rather than propagating."""

        def validator(result):
            raise ValueError("Validator crashed")

        gate = QualityGate(custom_validator=validator)
        step = make_step()
        result = make_step_result(result={"data": "test"})
        check = gate.check(step, result)
        assert check.status == CheckStatus.FAIL
        assert "error" in check.reason.lower() or "Validator crashed" in check.reason

    def test_combined_required_fields_and_word_count(self):
        """Required fields and minimum word count are enforced together."""
        config = QualityGateConfig(required_fields=["report"], min_word_count=5)
        gate = QualityGate(config=config)
        step = make_step()

        # Required field present, but too few words.
        result = make_step_result(result={"report": "hi"})
        check = gate.check(step, result)
        assert check.status == CheckStatus.FAIL

        # Required field present and enough words.
        result2 = make_step_result(result={"report": "This is a detailed report content"})
        check2 = gate.check(step, result2)
        assert check2.status == CheckStatus.PASS

    def test_quality_score_decreases_with_failures(self):
        """Two failing rules push the quality score below 0.5."""
        config = QualityGateConfig(required_fields=["a", "b"], min_word_count=100)
        gate = QualityGate(config=config)
        step = make_step()
        # Field "b" is missing and the word count is far below the minimum.
        result = make_step_result(result={"a": "x"})
        check = gate.check(step, result)
        assert check.quality_score < 0.5
|
|
|
|
|
|
# --- RuleBasedStepReflector Tests ---
|
|
|
|
|
|
class TestRuleBasedStepReflector:
    """Behaviour of the rule-based step reflector."""

    @pytest.mark.asyncio
    async def test_completed_step_score(self):
        """A clean completed step scores at least 0.8 with no suggestions."""
        reflector = RuleBasedStepReflector()
        step = make_step()
        result = make_step_result(
            result={"data": "test"},
            retry_count=0,
            duration_ms=5000,
        )
        score, suggestions = await reflector.reflect_step(step, result)
        assert score >= 0.8
        assert len(suggestions) == 0

    @pytest.mark.asyncio
    async def test_failed_step_zero_score(self):
        """A failed step scores 0.0 and comes with at least one suggestion."""
        reflector = RuleBasedStepReflector()
        step = make_step()
        result = make_step_result(
            status=PlanStepStatus.FAILED,
            error="Something went wrong",
        )
        score, suggestions = await reflector.reflect_step(step, result)
        assert score == 0.0
        assert len(suggestions) > 0

    @pytest.mark.asyncio
    async def test_retry_suggestion(self):
        """A step that needed retries produces a retry-related suggestion."""
        reflector = RuleBasedStepReflector()
        step = make_step()
        result = make_step_result(
            result={"data": "test"},
            retry_count=2,
        )
        # Only the suggestions matter here; the score is deliberately ignored.
        _, suggestions = await reflector.reflect_step(step, result)
        assert any("retries" in s.lower() or "retry" in s.lower() for s in suggestions)

    @pytest.mark.asyncio
    async def test_slow_step_suggestion(self):
        """A slow step produces an optimisation suggestion."""
        reflector = RuleBasedStepReflector()
        step = make_step()
        result = make_step_result(
            result={"data": "test"},
            duration_ms=120000,  # 120 s
        )
        _, suggestions = await reflector.reflect_step(step, result)
        assert any("slow" in s.lower() or "optimizing" in s.lower() for s in suggestions)

    @pytest.mark.asyncio
    async def test_timeout_error_suggestion(self):
        """A timeout error produces a timeout-related suggestion."""
        reflector = RuleBasedStepReflector()
        step = make_step()
        result = make_step_result(
            status=PlanStepStatus.FAILED,
            error="Step timed out after 300s",
        )
        _, suggestions = await reflector.reflect_step(step, result)
        assert any("timed out" in s.lower() or "timeout" in s.lower() for s in suggestions)
|
|
|
|
|
|
# --- PlanChecker.check_step Tests ---
|
|
|
|
|
|
class TestPlanCheckerCheckStep:
    """Single-step checks through ``PlanChecker.check_step``."""

    @pytest.mark.asyncio
    async def test_check_step_pass(self):
        """A completed step with output passes and scores above 0.5."""
        verdict = await PlanChecker().check_step(
            make_step(),
            make_step_result(result={"data": "test"}),
        )
        assert verdict.status == CheckStatus.PASS
        assert verdict.quality_score > 0.5

    @pytest.mark.asyncio
    async def test_check_step_fail_quality_gate(self):
        """A step missing a required field is rejected by the quality gate."""
        gate_config = QualityGateConfig(required_fields=["missing_field"])
        checker = PlanChecker(quality_gate_config=gate_config)
        verdict = await checker.check_step(
            make_step(),
            make_step_result(result={"data": "test"}),
        )
        assert verdict.status == CheckStatus.FAIL

    @pytest.mark.asyncio
    async def test_check_step_skip_for_failed_status(self):
        """A step that already failed is skipped rather than checked."""
        verdict = await PlanChecker().check_step(
            make_step(),
            make_step_result(status=PlanStepStatus.FAILED, error="error"),
        )
        assert verdict.status == CheckStatus.SKIP

    @pytest.mark.asyncio
    async def test_check_step_records_result(self):
        """The check result is stored on the checker under the step id."""
        checker = PlanChecker()
        plan_step = make_step(step_id="s1")
        exec_result = make_step_result(step_id="s1", result={"data": "test"})
        await checker.check_step(plan_step, exec_result)
        assert "s1" in checker._check_results

    @pytest.mark.asyncio
    async def test_check_step_with_step_specific_config(self):
        """Each step is checked against its own quality configuration."""
        per_step_configs = {
            "s0": QualityGateConfig(required_fields=["report"]),
            "s1": QualityGateConfig(required_fields=["analysis"]),
        }
        checker = PlanChecker(step_quality_configs=per_step_configs)

        # s0 lacks its required "report" field.
        verdict_s0 = await checker.check_step(
            make_step(step_id="s0"),
            make_step_result(step_id="s0", result={"data": "test"}),
        )
        assert verdict_s0.status == CheckStatus.FAIL

        # s1 carries its required "analysis" field.
        verdict_s1 = await checker.check_step(
            make_step(step_id="s1"),
            make_step_result(step_id="s1", result={"analysis": "result"}),
        )
        assert verdict_s1.status == CheckStatus.PASS

    @pytest.mark.asyncio
    async def test_check_step_with_custom_validator(self):
        """A custom validator decides between PASS and FAIL."""

        def validator(result):
            if result and result.get("format") == "json":
                return (True, "")
            return (False, "Expected JSON format")

        checker = PlanChecker(custom_validator=validator)
        plan_step = make_step()

        # Accepted format.
        accepted = await checker.check_step(
            plan_step,
            make_step_result(result={"format": "json", "data": {}}),
        )
        assert accepted.status == CheckStatus.PASS

        # Rejected format.
        rejected = await checker.check_step(
            plan_step,
            make_step_result(result={"format": "xml", "data": {}}),
        )
        assert rejected.status == CheckStatus.FAIL
|
|
|
|
|
|
# --- PlanChecker.should_retry / should_request_human Tests ---
|
|
|
|
|
|
class TestPlanCheckerRetryAndHuman:
    """Decisions made by ``should_retry`` and ``should_request_human``."""

    def test_should_retry_on_fail_within_limit(self):
        """A failed check is retried while retries remain."""
        checker = PlanChecker(max_check_retries=2)
        failed = CheckResult(step_id="s0", status=CheckStatus.FAIL, reason="quality low")
        for attempts_so_far in (0, 1):
            assert checker.should_retry(failed, attempts_so_far) is True

    def test_should_not_retry_on_pass(self):
        """A passed check is never retried."""
        checker = PlanChecker(max_check_retries=2)
        passed = CheckResult(step_id="s0", status=CheckStatus.PASS)
        assert checker.should_retry(passed, 0) is False

    def test_should_not_retry_on_skip(self):
        """A skipped check is never retried."""
        checker = PlanChecker(max_check_retries=2)
        skipped = CheckResult(step_id="s0", status=CheckStatus.SKIP)
        assert checker.should_retry(skipped, 0) is False

    def test_should_not_retry_exhausted(self):
        """A failed check is not retried once the retry budget is used up."""
        checker = PlanChecker(max_check_retries=1)
        failed = CheckResult(step_id="s0", status=CheckStatus.FAIL, reason="quality low")
        assert checker.should_retry(failed, 1) is False

    def test_should_request_human_on_exhausted_retries(self):
        """Human intervention is requested once retries are used up."""
        checker = PlanChecker(max_check_retries=1)
        failed = CheckResult(step_id="s0", status=CheckStatus.FAIL, reason="quality low")
        assert checker.should_request_human(failed, 1) is True

    def test_should_not_request_human_on_pass(self):
        """A passed check never asks for human intervention."""
        checker = PlanChecker(max_check_retries=1)
        passed = CheckResult(step_id="s0", status=CheckStatus.PASS)
        assert checker.should_request_human(passed, 0) is False

    def test_should_not_request_human_within_retries(self):
        """No human intervention is requested while retries remain."""
        checker = PlanChecker(max_check_retries=2)
        failed = CheckResult(step_id="s0", status=CheckStatus.FAIL, reason="quality low")
        assert checker.should_request_human(failed, 0) is False
|
|
|
|
|
|
# --- PlanChecker.review_plan Tests ---
|
|
|
|
|
|
class TestPlanCheckerReviewPlan:
    """Review-report generation through ``PlanChecker.review_plan``."""

    @pytest.mark.asyncio
    async def test_all_steps_pass_review(self):
        """When every step passes, the report describes a full success."""
        checker = PlanChecker()
        search = make_step(step_id="s0", name="Search")
        analyze = make_step(step_id="s1", name="Analyze")
        plan = make_plan(steps=[search, analyze])
        executed = make_plan_result(
            step_results={
                "s0": make_step_result(step_id="s0", result={"data": "A"}),
                "s1": make_step_result(step_id="s1", result={"data": "B"}),
            },
        )

        # Check each step first, in plan order.
        for plan_step in (search, analyze):
            await checker.check_step(plan_step, executed.step_results[plan_step.step_id])

        # Then review the whole plan.
        report = await checker.review_plan(plan, executed)

        assert report.outcome == "success"
        assert "s0" in report.success_path
        assert "s1" in report.success_path
        assert len(report.failure_reasons) == 0
        assert report.success_rate == 1.0

    @pytest.mark.asyncio
    async def test_partial_failure_review(self):
        """When some steps fail, the report lists the failure reasons."""
        checker = PlanChecker()
        search = make_step(step_id="s0", name="Search")
        analyze = make_step(step_id="s1", name="Analyze")
        plan = make_plan(steps=[search, analyze])
        executed = make_plan_result(
            step_results={
                "s0": make_step_result(step_id="s0", result={"data": "A"}),
                "s1": make_step_result(
                    step_id="s1",
                    status=PlanStepStatus.FAILED,
                    error="Agent crashed",
                ),
            },
        )

        for plan_step in (search, analyze):
            await checker.check_step(plan_step, executed.step_results[plan_step.step_id])

        report = await checker.review_plan(plan, executed)

        assert report.outcome == "partial"
        assert "s0" in report.success_path
        assert len(report.failure_reasons) > 0
        assert any("s1" in reason for reason in report.failure_reasons)
        assert report.success_rate == 0.5

    @pytest.mark.asyncio
    async def test_all_failure_review(self):
        """When every step fails, the report outcome is ``failure``."""
        checker = PlanChecker()
        search = make_step(step_id="s0", name="Search")
        plan = make_plan(steps=[search])
        executed = make_plan_result(
            step_results={
                "s0": make_step_result(
                    step_id="s0",
                    status=PlanStepStatus.FAILED,
                    error="Agent unavailable",
                ),
            },
        )

        await checker.check_step(search, executed.step_results["s0"])
        report = await checker.review_plan(plan, executed)

        assert report.outcome == "failure"
        assert len(report.failure_reasons) > 0

    @pytest.mark.asyncio
    async def test_review_report_contains_duration_distribution(self):
        """The report exposes each step's duration."""
        checker = PlanChecker()
        first = make_step(step_id="s0")
        second = make_step(step_id="s1")
        plan = make_plan(steps=[first, second])
        executed = make_plan_result(
            step_results={
                "s0": make_step_result(step_id="s0", result={"data": "A"}, duration_ms=100.0),
                "s1": make_step_result(step_id="s1", result={"data": "B"}, duration_ms=200.0),
            },
        )

        for plan_step in (first, second):
            await checker.check_step(plan_step, executed.step_results[plan_step.step_id])

        report = await checker.review_plan(plan, executed)
        durations = report.duration_distribution

        assert "s0" in durations
        assert "s1" in durations
        assert durations["s0"] == 100.0
        assert durations["s1"] == 200.0

    @pytest.mark.asyncio
    async def test_review_report_contains_quality_scores(self):
        """The report exposes a positive quality score per checked step."""
        checker = PlanChecker()
        only_step = make_step(step_id="s0")
        plan = make_plan(steps=[only_step])
        executed = make_plan_result(
            step_results={
                "s0": make_step_result(step_id="s0", result={"data": "A"}),
            },
        )

        await checker.check_step(only_step, executed.step_results["s0"])
        report = await checker.review_plan(plan, executed)

        assert "s0" in report.quality_scores
        assert report.quality_scores["s0"] > 0

    @pytest.mark.asyncio
    async def test_review_report_contains_optimization_tips(self):
        """A slow, retried step leads to at least one optimisation tip."""
        checker = PlanChecker()
        only_step = make_step(step_id="s0")
        plan = make_plan(steps=[only_step])
        executed = make_plan_result(
            step_results={
                "s0": make_step_result(
                    step_id="s0",
                    result={"data": "A"},
                    retry_count=2,
                    duration_ms=120000,
                ),
            },
        )

        await checker.check_step(only_step, executed.step_results["s0"])
        report = await checker.review_plan(plan, executed)

        assert len(report.optimization_tips) > 0

    @pytest.mark.asyncio
    async def test_review_report_to_dict(self):
        """The report can be serialised to a plain dictionary."""
        checker = PlanChecker()
        only_step = make_step(step_id="s0")
        plan = make_plan(steps=[only_step])
        executed = make_plan_result(
            step_results={
                "s0": make_step_result(step_id="s0", result={"data": "A"}),
            },
        )

        await checker.check_step(only_step, executed.step_results["s0"])
        report = await checker.review_plan(plan, executed)

        as_dict = report.to_dict()
        assert as_dict["plan_id"] == "p1"
        assert as_dict["outcome"] == "success"
        assert isinstance(as_dict["success_path"], list)
        assert isinstance(as_dict["failure_reasons"], list)
        assert isinstance(as_dict["optimization_tips"], list)
|
|
|
|
|
|
# --- PlanChecker + ExperienceStore Integration Tests ---
|
|
|
|
|
|
class TestPlanCheckerExperienceStore:
    """Review results being written to the experience store."""

    @pytest.mark.asyncio
    async def test_experience_written_on_review(self):
        """Reviewing a plan records one experience in the store."""
        store = InMemoryExperienceStore()
        checker = PlanChecker(experience_store=store)

        step0 = make_step(step_id="s0")
        plan = make_plan(steps=[step0])
        plan_result = make_plan_result(
            step_results={
                "s0": make_step_result(step_id="s0", result={"data": "A"}),
            },
        )

        await checker.check_step(step0, plan_result.step_results["s0"])
        # The returned report is not needed; only the store's content is checked.
        await checker.review_plan(
            plan, plan_result, task_type="test_task", goal="test goal"
        )

        # The experience must now be retrievable from the store.
        results = await store.search("test_task", top_k=10)
        assert len(results) == 1
        assert results[0].outcome == "success"
        assert results[0].task_type == "test_task"
        assert results[0].goal == "test goal"

    @pytest.mark.asyncio
    async def test_failure_experience_written(self):
        """A failure experience is recorded and can be retrieved."""
        store = InMemoryExperienceStore()
        checker = PlanChecker(experience_store=store)

        step0 = make_step(step_id="s0")
        plan = make_plan(steps=[step0])
        plan_result = make_plan_result(
            step_results={
                "s0": make_step_result(
                    step_id="s0",
                    status=PlanStepStatus.FAILED,
                    error="Agent crashed",
                ),
            },
        )

        await checker.check_step(step0, plan_result.step_results["s0"])
        # The returned report is not needed; only the store's content is checked.
        await checker.review_plan(
            plan, plan_result, task_type="risky_task", goal="risky goal"
        )

        # The failure experience must now be retrievable from the store.
        results = await store.search("risky_task", top_k=10)
        assert len(results) == 1
        assert results[0].outcome == "failure"
        assert len(results[0].failure_reasons) > 0

    @pytest.mark.asyncio
    async def test_experience_searchable_by_failure_reason(self):
        """AE3: a recorded failure can later be found as a pitfall warning."""
        store = InMemoryExperienceStore()

        # First run: record the failure experience.
        checker = PlanChecker(experience_store=store)
        step0 = make_step(step_id="s0")
        plan = make_plan(steps=[step0])
        plan_result = make_plan_result(
            step_results={
                "s0": make_step_result(
                    step_id="s0",
                    status=PlanStepStatus.FAILED,
                    error="Database connection timeout",
                ),
            },
        )

        await checker.check_step(step0, plan_result.step_results["s0"])
        await checker.review_plan(
            plan, plan_result, task_type="db_query", goal="query database"
        )

        # Second run: search for related experience.
        results = await store.search("database timeout", top_k=5, task_type="db_query")
        assert len(results) >= 1
        assert results[0].outcome == "failure"
        assert any("timeout" in r.lower() for r in results[0].failure_reasons)

    @pytest.mark.asyncio
    async def test_no_experience_store_still_works(self):
        """Review works normally when no experience store is configured."""
        checker = PlanChecker()  # no experience_store
        step0 = make_step(step_id="s0")
        plan = make_plan(steps=[step0])
        plan_result = make_plan_result(
            step_results={
                "s0": make_step_result(step_id="s0", result={"data": "A"}),
            },
        )

        await checker.check_step(step0, plan_result.step_results["s0"])
        report = await checker.review_plan(plan, plan_result)

        assert report.outcome == "success"
        assert report.plan_id == "p1"

    @pytest.mark.asyncio
    async def test_experience_store_error_does_not_crash(self):
        """A store that raises on write does not break the review."""

        class FailingStore:
            async def record_experience(self, experience):
                raise RuntimeError("Store is down")

        checker = PlanChecker(experience_store=FailingStore())
        step0 = make_step(step_id="s0")
        plan = make_plan(steps=[step0])
        plan_result = make_plan_result(
            step_results={
                "s0": make_step_result(step_id="s0", result={"data": "A"}),
            },
        )

        await checker.check_step(step0, plan_result.step_results["s0"])
        # Must not raise even though the store fails.
        report = await checker.review_plan(plan, plan_result)
        assert report.outcome == "success"
|
|
|
|
|
|
# --- PlanChecker + PlanExecutor Integration Pattern Tests ---
|
|
|
|
|
|
class TestPlanCheckerExecutorIntegration:
    """Integration pattern between PlanChecker and PlanExecutor."""

    @pytest.mark.asyncio
    async def test_make_step_complete_callback(self):
        """The callback from ``make_step_complete_callback`` records the check."""
        checker = PlanChecker()
        on_step_complete = checker.make_step_complete_callback()

        await on_step_complete(
            make_step(step_id="s0"),
            make_step_result(step_id="s0", result={"data": "test"}),
        )

        recorded = checker._check_results
        assert "s0" in recorded
        assert recorded["s0"].status == CheckStatus.PASS

    @pytest.mark.asyncio
    async def test_full_check_review_cycle(self):
        """Full loop: check each step, review the plan, record the experience."""
        store = InMemoryExperienceStore()
        checker = PlanChecker(experience_store=store)

        # Simulate a three-step plan.
        search = make_step(step_id="s0", name="Search")
        analyze = make_step(step_id="s1", name="Analyze")
        write_up = make_step(step_id="s2", name="Report")
        plan_steps = [search, analyze, write_up]

        plan = make_plan(steps=plan_steps)
        executed = make_plan_result(
            step_results={
                "s0": make_step_result(step_id="s0", result={"search": "data"}, duration_ms=500),
                "s1": make_step_result(step_id="s1", result={"analysis": "result"}, duration_ms=1500),
                "s2": make_step_result(step_id="s2", result={"report": "done"}, duration_ms=800),
            },
        )

        # Check step by step, in plan order.
        for plan_step in plan_steps:
            await checker.check_step(plan_step, executed.step_results[plan_step.step_id])

        # Review the whole run.
        report = await checker.review_plan(
            plan, executed, task_type="analysis", goal="analyze data"
        )

        # The report reflects a complete success.
        assert report.outcome == "success"
        assert len(report.success_path) == 3
        assert report.success_rate == 1.0
        assert len(report.duration_distribution) == 3

        # The experience has been written to the store.
        stored = await store.search("analysis", top_k=10)
        assert len(stored) == 1
        assert stored[0].success_rate == 1.0
|
|
|
|
|
|
# --- PlanChecker Reset Tests ---
|
|
|
|
|
|
class TestPlanCheckerReset:
    """Resetting the checker's internal state."""

    @pytest.mark.asyncio
    async def test_reset_clears_check_results(self):
        """``reset`` empties the recorded check results."""
        checker = PlanChecker()
        plan_step = make_step(step_id="s0")
        exec_result = make_step_result(result={"data": "test"})

        await checker.check_step(plan_step, exec_result)
        assert len(checker._check_results) > 0

        checker.reset()
        assert len(checker._check_results) == 0

    @pytest.mark.asyncio
    async def test_reset_allows_new_check_cycle(self):
        """A new round of checks can start after ``reset``."""
        checker = PlanChecker()
        plan_step = make_step(step_id="s0")

        # First round.
        first_round = make_step_result(result={"data": "test1"})
        await checker.check_step(plan_step, first_round)
        checker.reset()

        # Second round.
        second_round = make_step_result(result={"data": "test2"})
        verdict = await checker.check_step(plan_step, second_round)
        assert verdict.status == CheckStatus.PASS
|
|
|
|
|
|
# --- PlanChecker without LLM Tests ---
|
|
|
|
|
|
class TestPlanCheckerWithoutLLM:
    """PlanChecker used without an LLM-backed reflector."""

    @pytest.mark.asyncio
    async def test_works_without_llm(self):
        """A checker built with no arguments still passes and scores a step."""
        # No reflector is supplied, so the checker's default one is used.
        checker = PlanChecker()
        verdict = await checker.check_step(
            make_step(),
            make_step_result(result={"data": "test"}),
        )
        assert verdict.status == CheckStatus.PASS
        assert verdict.quality_score > 0

    @pytest.mark.asyncio
    async def test_custom_reflector(self):
        """Suggestions from a user-supplied reflector reach the check details."""

        class CustomReflector:
            async def reflect_step(self, step, exec_result):
                return (0.9, ["Custom suggestion"])

        checker = PlanChecker(reflector=CustomReflector())
        verdict = await checker.check_step(
            make_step(),
            make_step_result(result={"data": "test"}),
        )
        assert verdict.status == CheckStatus.PASS
        reported = verdict.details.get("reflector_suggestions", [])
        assert "Custom suggestion" in reported
|
|
|
|
|
|
# --- Edge Cases ---
|
|
|
|
|
|
class TestPlanCheckerEdgeCases:
    """Edge cases for PlanChecker."""

    @pytest.mark.asyncio
    async def test_empty_plan_review(self):
        """Reviewing a plan with no steps and no results."""
        checker = PlanChecker()
        plan = make_plan(steps=[])
        plan_result = make_plan_result(step_results={})
        report = await checker.review_plan(plan, plan_result)
        assert report.outcome == "success"
        assert report.success_rate == 0.0

    @pytest.mark.asyncio
    async def test_all_skipped_steps_review(self):
        """Reviewing a plan in which every step was skipped."""
        checker = PlanChecker()
        step0 = make_step(step_id="s0")
        step1 = make_step(step_id="s1")

        plan = make_plan(steps=[step0, step1])
        plan_result = make_plan_result(
            step_results={
                "s0": make_step_result(
                    step_id="s0",
                    status=PlanStepStatus.SKIPPED,
                    error="Dependency failed",
                ),
                "s1": make_step_result(
                    step_id="s1",
                    status=PlanStepStatus.SKIPPED,
                    error="Dependency failed",
                ),
            },
        )

        report = await checker.review_plan(plan, plan_result)
        assert report.outcome == "failure"
        assert len(report.failure_reasons) > 0

    @pytest.mark.asyncio
    async def test_quality_threshold_triggers_fail(self):
        """A quality score below the threshold makes the check fail."""

        class LowScoreReflector:
            async def reflect_step(self, step, exec_result):
                return (0.2, ["Low quality output"])

        # The previous version used a threshold of 0.5, but by its own note the
        # combined score was 0.4 * 1.0 (gate) + 0.6 * 0.2 (reflector) = 0.52,
        # i.e. above the threshold, so FAIL could never be asserted. A threshold
        # of 0.9 puts the score clearly below it.
        # NOTE(review): assumes PlanChecker fails a step whose score is below
        # quality_threshold, as this test's name states; confirm against the
        # implementation.
        checker = PlanChecker(
            reflector=LowScoreReflector(),
            quality_threshold=0.9,
        )
        step = make_step()
        result = make_step_result(result={"data": "test"})
        check = await checker.check_step(step, result)
        assert check.quality_score < 1.0
        assert check.status == CheckStatus.FAIL

    @pytest.mark.asyncio
    async def test_reflector_exception_handled(self):
        """A reflector that raises does not break the check."""

        class CrashingReflector:
            async def reflect_step(self, step, exec_result):
                raise RuntimeError("Reflector crashed")

        checker = PlanChecker(reflector=CrashingReflector())
        step = make_step()
        result = make_step_result(result={"data": "test"})
        check = await checker.check_step(step, result)
        # The check must still produce a regular verdict.
        assert check.status in (CheckStatus.PASS, CheckStatus.FAIL)

    @pytest.mark.asyncio
    async def test_multiple_quality_failures_in_review(self):
        """The report collects the reasons of every step that failed its check."""
        config = QualityGateConfig(required_fields=["report"])
        checker = PlanChecker(quality_gate_config=config)

        step0 = make_step(step_id="s0")
        step1 = make_step(step_id="s1")

        plan = make_plan(steps=[step0, step1])
        plan_result = make_plan_result(
            step_results={
                "s0": make_step_result(step_id="s0", result={"data": "no report"}),
                "s1": make_step_result(step_id="s1", result={"data": "also no report"}),
            },
        )

        await checker.check_step(step0, plan_result.step_results["s0"])
        await checker.check_step(step1, plan_result.step_results["s1"])

        report = await checker.review_plan(plan, plan_result)
        # Each failed quality check must show up in failure_reasons.
        quality_fail_reasons = [
            r for r in report.failure_reasons if "quality check failed" in r
        ]
        assert len(quality_fail_reasons) == 2
|