fischer-agentkit/tests/manual/test_react_l4_smoke.py

296 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""ReAct L4 真实 LLM smoke test (U6).
手动验证 U4 的 L0 规则重排(`_build_tool_use_prompt` 中"何时必须使用工具"前置)
在真实 LLM 调用下是否生效 — Agent 面对复杂需求时是否调用 `web_search` 而非
直接回答。
不进 CI(依赖真实 LLM API key + 网络)。运行方式:
python3 tests/manual/test_react_l4_smoke.py
判定标准(plan U6):
- Probe 1-4 期望触发 web_search(≥3/4 pass 算通过)
- Probe 5 期望不触发工具调用(验证 escape hatch 规则 4 仍有效)
- 通过 → Bug 2 状态升级为 "L4 verified"
- 未通过 → 触发 L1(工具描述扩展)独立 plan
ponytail: 直接复用 cli.chat._build_gateway + server.app._create_provider
不重新实现 provider 注册逻辑。升级路径:抽到 shared 模块供 cli/server/manual
共用。
"""
from __future__ import annotations
import asyncio
import sys
import time
from dataclasses import dataclass
from pathlib import Path
# Make the ``agentkit`` package importable without installing it: ROOT is the
# third ancestor directory of this file, and ``<ROOT>/src`` is put at the front
# of the module search path before the agentkit imports below run.
_THIS_FILE = Path(__file__).resolve()
ROOT = _THIS_FILE.parent.parent.parent
sys.path.insert(0, str(ROOT / "src"))
from agentkit.core.react import ReActEngine, ReActResult, ReActStep # noqa: E402
from agentkit.server.config import find_config_path, load_config_with_dotenv # noqa: E402
from agentkit.tools.web_search import WebSearchTool # noqa: E402
# ---------------------------------------------------------------------------
# Probe queries
# ---------------------------------------------------------------------------
@dataclass
class Probe:
    """One probe query together with the tool-usage behaviour expected for it."""

    # Numeric identifier shown in the report output.
    id: int
    # Free-form label, e.g. external_info / realtime_data / multi_step /
    # uncertain / no_tool.
    category: str
    # Text sent to the engine as the user message.
    query: str
    # True: a web_search call is expected; False: a direct answer is expected.
    expect_tool_call: bool
# Probes 1-4 expect a tool call; probe 5 supplies all needed text inline and
# expects a direct answer.
PROBES: list[Probe] = [
    # 1: needs information that lives outside the model.
    Probe(
        id=1,
        category="external_info",
        query="收集 GitHub Trending 前 10 个项目信息并分析商业价值",
        expect_tool_call=True,
    ),
    # 2: asks for the latest news.
    Probe(
        id=2,
        category="realtime_data",
        query="最新 AI 领域有什么重要新闻?",
        expect_tool_call=True,
    ),
    # 3: comparison that asks for concrete figures.
    Probe(
        id=3,
        category="multi_step",
        query="对比 React 和 Vue 3 在大型项目中的性能差异,给出具体数据",
        expect_tool_call=True,
    ),
    # 4: simple real-time lookup (today's weather).
    Probe(
        id=4,
        category="realtime_data_simple",
        query="今天上海天气怎么样?",
        expect_tool_call=True,
    ),
    # 5: summarisation of text contained in the query itself.
    Probe(
        id=5,
        category="no_tool_escape_hatch",
        query=(
            "请帮我总结下面这段文字人工智能AI是计算机科学的一个分支"
            "它企图了解智能的实质,并生产出一种新的能以人类智能相似的方式做出"
            "反应的智能机器。AI 的研究包括机器人、语言识别、图像识别、自然语言"
            "处理和专家系统等。"
        ),
        expect_tool_call=False,
    ),
]
# ---------------------------------------------------------------------------
# Probe runner
# ---------------------------------------------------------------------------
def _count_tool_calls(result: ReActResult) -> tuple[int, list[str]]:
    """Collect the tools invoked during a run.

    A trajectory step is counted only when its ``action`` equals
    ``"tool_call"`` and its ``tool_name`` is truthy.

    Returns:
        ``(count, tool_names)`` where ``tool_names`` keeps trajectory order
        and repeats a name once per call.
    """
    invoked = [
        entry.tool_name
        for entry in result.trajectory
        if entry.action == "tool_call" and entry.tool_name
    ]
    return len(invoked), invoked
def _format_trajectory(steps: list[ReActStep]) -> str:
    """Render trajectory steps as a one-line-per-step report.

    ``tool_call`` steps show the tool name and a preview of ``str(arguments)``
    (at most 80 characters); ``final_answer`` steps show a preview of the
    content (at most 120 characters, newlines replaced by spaces); any other
    action is printed by name only. A trailing ``...`` is appended to a
    preview only when text was actually cut off, so a short answer is not
    displayed as if it had been truncated.

    Args:
        steps: Trajectory steps exposing ``step``, ``action``, ``tool_name``,
            ``arguments`` and ``content`` attributes.

    Returns:
        The rendered lines joined by newlines, or ``" (empty)"`` when there
        are no steps.
    """

    def _clip(text: str, limit: int) -> str:
        # Mark truncation explicitly; leave short text untouched.
        return f"{text[:limit]}..." if len(text) > limit else text

    lines: list[str] = []
    for s in steps:
        if s.action == "tool_call":
            args_text = str(s.arguments) if s.arguments else ""
            lines.append(f" [{s.step}] tool_call: {s.tool_name}({_clip(args_text, 80)})")
        elif s.action == "final_answer":
            # content may be None; flatten newlines so the step stays on one line.
            answer_text = (s.content or "").replace("\n", " ")
            lines.append(f" [{s.step}] final_answer: {_clip(answer_text, 120)}")
        else:
            lines.append(f" [{s.step}] {s.action}")
    return "\n".join(lines) if lines else " (empty)"
async def run_probe(engine: ReActEngine, probe: Probe) -> dict:
    """Run one probe through the engine and report whether tool usage matched.

    Sends ``probe.query`` as a single user message with ``WebSearchTool`` as
    the only tool, prints a per-probe report to stdout and returns a result
    record. Any ``Exception`` raised while executing the probe is caught and
    reported as status ``"error"`` instead of propagating.

    Args:
        engine: Engine whose ``execute`` coroutine performs the run.
        probe: Query plus the expected tool-usage behaviour.

    Returns:
        A dict with the same keys on every path: ``probe_id``, ``category``,
        ``expect_tool_call``, ``actual_tool_calls``, ``tool_names``,
        ``status`` (``"pass"``, ``"fail"`` or ``"error"``), ``error``,
        ``elapsed_s``, ``output_preview``, ``trajectory``, ``total_steps``,
        ``total_tokens`` and ``react_status``. The last three are ``None``
        when the run raised.
    """
    print(f"\n{'='*60}")
    print(f"Probe #{probe.id} [{probe.category}]")
    print(f"Query: {probe.query[:80]}{'...' if len(probe.query) > 80 else ''}")
    print(f"Expect tool_call: {probe.expect_tool_call}")
    print(f"{'-'*60}")

    # Single template for both outcomes so consumers never hit a missing key;
    # it starts out describing the error case and is overwritten on success.
    record: dict = {
        "probe_id": probe.id,
        "category": probe.category,
        "expect_tool_call": probe.expect_tool_call,
        "actual_tool_calls": 0,
        "tool_names": [],
        "status": "error",
        "error": None,
        "elapsed_s": 0.0,
        "output_preview": "",
        "trajectory": "",
        "total_steps": None,
        "total_tokens": None,
        "react_status": None,
    }

    messages = [{"role": "user", "content": probe.query}]
    start = time.monotonic()
    try:
        result = await engine.execute(
            messages=messages,
            tools=[WebSearchTool()],
            model="default",
            agent_name="l4_smoke",
            task_type="smoke_test",
        )
    except Exception as e:
        # Boundary of a manual smoke run: record the failure and let the
        # remaining probes continue.
        elapsed = time.monotonic() - start
        print(f"ERROR after {elapsed:.1f}s: {type(e).__name__}: {e}")
        record["error"] = f"{type(e).__name__}: {e}"
        record["elapsed_s"] = elapsed
        return record

    elapsed = time.monotonic() - start
    tool_count, tool_names = _count_tool_calls(result)

    # Verdict: "did the run call any tool" must equal the expectation. There
    # is no special case: a no-tool probe that triggers a tool call is a fail.
    actual_triggered = tool_count > 0
    status = "pass" if actual_triggered == probe.expect_tool_call else "fail"

    output_preview = (result.output or "")[:200].replace("\n", " ")
    traj_str = _format_trajectory(result.trajectory)
    print(f"Status: {status.upper()} ({elapsed:.1f}s, {result.total_steps} steps)")
    print(f"Tool calls: {tool_count} {tool_names}")
    print(f"Output preview: {output_preview}")
    print(f"Trajectory:\n{traj_str}")

    record.update(
        {
            "actual_tool_calls": tool_count,
            "tool_names": tool_names,
            "status": status,
            "elapsed_s": elapsed,
            "output_preview": output_preview,
            "trajectory": traj_str,
            "total_steps": result.total_steps,
            "total_tokens": result.total_tokens,
            "react_status": result.status,
        }
    )
    return record
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
async def main() -> int:
    """Run every probe, print a report and return the process exit code.

    Returns:
        ``0`` when the verdict is PASS, ``1`` when it is FAIL, and ``2`` when
        no verdict can be drawn: loading the config, finding a provider with
        an API key or building the gateway failed, or the pass thresholds
        were missed while at least one probe ended in status ``"error"``.
    """
    print("=" * 60)
    print("ReAct L4 Smoke Test (U6)")
    print("Verifies U4 L0 prompt rule rearrangement under real LLM calls")
    print("=" * 60)

    # 1. Load the server config (agentkit.yaml + env).
    print("\n[1/3] Loading server config...")
    try:
        config_path = find_config_path()
        if not config_path:
            print("FATAL: no agentkit.yaml found (./agentkit.yaml or ~/.agentkit/agentkit.yaml)")
            return 2
        server_config = load_config_with_dotenv(config_path)
    except Exception as e:
        print(f"FATAL: failed to load config: {type(e).__name__}: {e}")
        return 2
    if not server_config.llm_config.providers:
        print("FATAL: no LLM providers configured (check agentkit.yaml / .env)")
        return 2
    providers_with_key = [
        name for name, p in server_config.llm_config.providers.items()
        if p.api_key
    ]
    if not providers_with_key:
        print("FATAL: no LLM providers with api_key set")
        print("Set env vars (e.g. OPENAI_API_KEY) or configure agentkit.yaml")
        return 2
    print(f" Providers with key: {providers_with_key}")
    print(f" Default model alias: {server_config.llm_config.model_aliases.get('default', '<unset>')}")

    # 2. Build the LLM gateway and the ReAct engine.
    print("\n[2/3] Building LLM gateway + ReAct engine...")
    # Imported here rather than at module top, as in the original script.
    from agentkit.cli.chat import _build_gateway
    gateway = _build_gateway(server_config)
    if not gateway.has_providers:
        print("FATAL: gateway has no registered providers")
        return 2
    # NOTE(review): reads a private attribute purely for diagnostics.
    print(f" Gateway providers: {list(gateway._providers.keys())}")
    engine = ReActEngine(
        llm_gateway=gateway,
        max_steps=8,  # leave enough steps for several search rounds
        default_timeout=120.0,  # two-minute cap per query
        enable_tool_search=False,  # simplification: inject the full web_search description directly
    )

    # 3. Run all probes sequentially.
    print(f"\n[3/3] Running {len(PROBES)} probes...")
    results: list[dict] = []
    for probe in PROBES:
        # Reset engine state each time so loop detection cannot misfire
        # across queries.
        engine.reset()
        results.append(await run_probe(engine, probe))

    # 4. Summary table.
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"{'#':>3} {'Category':<24} {'Expect':>8} {'Actual':>8} {'Status':>6} {'Time':>6}")
    print("-" * 60)
    for r in results:
        expect_str = "tool" if r["expect_tool_call"] else "direct"
        actual_str = f"{r['actual_tool_calls']} tool" if r["actual_tool_calls"] > 0 else "direct"
        print(
            f"{r['probe_id']:>3} "
            f"{r['category']:<24} "
            f"{expect_str:>8} "
            f"{actual_str:>8} "
            f"{r['status']:>6} "
            f"{r['elapsed_s']:>5.1f}s"
        )

    # 5. Verdict.
    print("\n" + "-" * 60)
    min_tool_pass = 3  # tool-call probes that must pass
    tool_probes = [r for r in results if r["expect_tool_call"]]
    direct_probes = [r for r in results if not r["expect_tool_call"]]
    errored = [r for r in results if r["status"] == "error"]
    tool_pass = sum(1 for r in tool_probes if r["status"] == "pass")
    direct_pass = sum(1 for r in direct_probes if r["status"] == "pass")
    print(
        f"Tool-call probes: {tool_pass}/{len(tool_probes)} passed "
        f"(threshold: ≥{min_tool_pass}/{len(tool_probes)})"
    )
    print(f"Direct-answer probes: {direct_pass}/{len(direct_probes)} passed")
    overall_pass = (tool_pass >= min_tool_pass) and (direct_pass == len(direct_probes))
    print("\n" + "=" * 60)
    if overall_pass:
        print("VERDICT: PASS — Bug 2 status upgraded to 'L4 verified'")
        print("Action: update plan Progress table U6 → done")
        return 0
    if errored:
        # An errored probe produced no observation of the agent's behaviour,
        # so a missed threshold says nothing about the prompt rules. Report
        # it like the other "could not run" outcomes instead of as a FAIL.
        print(f"VERDICT: INCONCLUSIVE — {len(errored)} probe(s) ended in error")
        print("Action: fix the error(s) reported above and re-run")
        return 2
    print("VERDICT: FAIL — L0 rule rearrangement insufficient")
    print("Action: trigger L1 (web_search description expansion) as independent plan")
    print(" also consider L2 (PLAN_EXEC phase policy) if L1 alone insufficient")
    return 1
if __name__ == "__main__":
    # Run the async entry point and use its return value as the exit status.
    exit_code = asyncio.run(main())
    sys.exit(exit_code)