"""ReAct L4 real-LLM smoke test (U6).

Manually verifies whether the U4 L0 rule reordering (moving the "when tools
must be used" section to the front of `_build_tool_use_prompt`) takes effect
under real LLM calls — i.e. whether the agent, when faced with a complex
request, calls `web_search` instead of answering directly.

Not part of CI (depends on a real LLM API key and network access). Run with:

    python3 tests/manual/test_react_l4_smoke.py

Pass criteria (plan U6):
- Probes 1-4 are expected to trigger web_search (≥3/4 passing counts as a pass)
- Probe 5 is expected to trigger no tool call (verifies that escape-hatch rule 4 still works)
- Pass → Bug 2 status is upgraded to "L4 verified"
- Fail → triggers the independent L1 (tool description expansion) plan

ponytail: directly reuses cli.chat._build_gateway + server.app._create_provider
instead of re-implementing the provider registration logic. Upgrade path:
extract them into a shared module used by cli/server/manual.
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import sys
|
||
import time
|
||
from dataclasses import dataclass
|
||
from pathlib import Path
|
||
|
||
# Make the in-repo ``agentkit`` package importable. The script is meant to be
# launched from the repository root; ROOT is the directory three levels above
# this file, and its ``src`` directory is put at the front of ``sys.path``.
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT.joinpath("src")))
|
||
|
||
from agentkit.core.react import ReActEngine, ReActResult, ReActStep # noqa: E402
|
||
from agentkit.server.config import find_config_path, load_config_with_dotenv # noqa: E402
|
||
from agentkit.tools.web_search import WebSearchTool # noqa: E402
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Probe queries
|
||
# ---------------------------------------------------------------------------
|
||
|
||
@dataclass
class Probe:
    """One probe query plus the agent behaviour it is expected to produce."""

    # Numeric identifier printed in the per-probe output and the summary table.
    id: int
    # Scenario label, e.g. external_info / realtime_data / multi_step /
    # uncertain / no_tool.
    category: str
    # The user message sent to the agent.
    query: str
    # True: a web_search tool call is expected; False: a direct answer is expected.
    expect_tool_call: bool
|
||
|
||
|
||
# The probe set, run in this order. Probes 1-4 expect a tool call; probe 5 is
# the escape-hatch check and expects a direct answer with no tool call.
PROBES: list[Probe] = [
    # 1: needs external information (GitHub Trending).
    Probe(
        id=1,
        category="external_info",
        query="收集 GitHub Trending 前 10 个项目信息并分析商业价值",
        expect_tool_call=True,
    ),
    # 2: asks for the latest news.
    Probe(
        id=2,
        category="realtime_data",
        query="最新 AI 领域有什么重要新闻?",
        expect_tool_call=True,
    ),
    # 3: comparison that asks for concrete data.
    Probe(
        id=3,
        category="multi_step",
        query="对比 React 和 Vue 3 在大型项目中的性能差异,给出具体数据",
        expect_tool_call=True,
    ),
    # 4: short real-time question (today's weather).
    Probe(
        id=4,
        category="realtime_data_simple",
        query="今天上海天气怎么样?",
        expect_tool_call=True,
    ),
    # 5: summarise text that is fully contained in the prompt itself.
    Probe(
        id=5,
        category="no_tool_escape_hatch",
        query="请帮我总结下面这段文字:人工智能(AI)是计算机科学的一个分支,"
        "它企图了解智能的实质,并生产出一种新的能以人类智能相似的方式做出"
        "反应的智能机器。AI 的研究包括机器人、语言识别、图像识别、自然语言"
        "处理和专家系统等。",
        expect_tool_call=False,
    ),
]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Probe runner
|
||
# ---------------------------------------------------------------------------
|
||
|
||
def _count_tool_calls(result: ReActResult) -> tuple[int, list[str]]:
    """Collect the tool calls recorded in a run's trajectory.

    A step is counted only when its ``action`` equals ``"tool_call"`` and its
    ``tool_name`` is truthy.

    Returns:
        ``(count, tool_names)`` where ``tool_names`` lists the called tools in
        trajectory order and ``count`` is its length.
    """
    called = [
        entry.tool_name
        for entry in result.trajectory
        if entry.action == "tool_call" and entry.tool_name
    ]
    return len(called), called
|
||
|
||
|
||
def _format_trajectory(steps: list[ReActStep]) -> str:
    """Render a trajectory as one line per step for the printed report.

    - ``tool_call`` steps show the tool name and a preview of the arguments
      (at most 80 characters of ``str(arguments)``).
    - ``final_answer`` steps show a preview of the content (at most 120
      characters, newlines replaced by spaces).
    - Any other step shows only its action name.

    A trailing ``...`` is appended to a preview only when text was actually
    cut off, matching how ``run_probe`` previews the query. Previously the
    answer preview always ended in ``...`` (even for an empty answer) and a
    cut-off argument preview carried no marker at all.

    Returns:
        The lines joined by newlines, or `` (empty)`` when ``steps`` is empty.
    """

    def _clip(text: str, limit: int) -> str:
        # Truncate to ``limit`` characters, flagging the cut with an ellipsis.
        return text[:limit] + ("..." if len(text) > limit else "")

    lines: list[str] = []
    for s in steps:
        if s.action == "tool_call":
            args_preview = _clip(str(s.arguments), 80) if s.arguments else ""
            lines.append(f" [{s.step}] tool_call: {s.tool_name}({args_preview})")
        elif s.action == "final_answer":
            # ``content`` may be None; treat that as an empty answer.
            preview = _clip(s.content or "", 120).replace("\n", " ")
            lines.append(f" [{s.step}] final_answer: {preview}")
        else:
            lines.append(f" [{s.step}] {s.action}")
    return "\n".join(lines) if lines else " (empty)"
|
||
|
||
|
||
async def run_probe(engine: ReActEngine, probe: Probe) -> dict:
    """Run a single probe through the engine, print its outcome and grade it.

    The probe query is sent as the only user message, with a fresh
    ``WebSearchTool`` as the only available tool.

    Grading: the probe passes when "at least one tool call was recorded"
    equals ``probe.expect_tool_call``. Any recorded tool call counts; the
    tool name is not checked.

    Args:
        engine: ReAct engine used to execute the query.
        probe: The probe to run.

    Returns:
        A result dict that always has the same keys:
        ``probe_id``, ``category``, ``expect_tool_call``, ``actual_tool_calls``,
        ``tool_names``, ``status`` (``"pass"`` / ``"fail"`` / ``"error"``),
        ``error``, ``elapsed_s``, ``output_preview``, ``trajectory``,
        ``total_steps``, ``total_tokens``, ``react_status``.
        When execution raised, ``status`` is ``"error"``, ``error`` holds the
        exception text and the three engine-derived fields are ``None``.
    """
    print(f"\n{'='*60}")
    print(f"Probe #{probe.id} [{probe.category}]")
    print(f"Query: {probe.query[:80]}{'...' if len(probe.query) > 80 else ''}")
    print(f"Expect tool_call: {probe.expect_tool_call}")
    print(f"{'-'*60}")

    # Start from the error-shaped record so both outcomes expose identical
    # keys; the success path overwrites every engine-derived field below.
    record: dict = {
        "probe_id": probe.id,
        "category": probe.category,
        "expect_tool_call": probe.expect_tool_call,
        "actual_tool_calls": 0,
        "tool_names": [],
        "status": "error",
        "error": None,
        "elapsed_s": 0.0,
        "output_preview": "",
        "trajectory": "",
        "total_steps": None,
        "total_tokens": None,
        "react_status": None,
    }

    messages = [{"role": "user", "content": probe.query}]
    start = time.monotonic()

    try:
        result = await engine.execute(
            messages=messages,
            tools=[WebSearchTool()],
            model="default",
            agent_name="l4_smoke",
            task_type="smoke_test",
        )
    except Exception as e:
        # Deliberately broad: one failing probe (network, provider, tool
        # setup) must be reported without aborting the remaining probes.
        elapsed = time.monotonic() - start
        print(f"ERROR after {elapsed:.1f}s: {type(e).__name__}: {e}")
        record["error"] = f"{type(e).__name__}: {e}"
        record["elapsed_s"] = elapsed
        return record

    elapsed = time.monotonic() - start
    tool_count, tool_names = _count_tool_calls(result)

    # Verdict: "a tool was actually triggered" must equal the expectation.
    # There is no special case for the no-tool probe: an unexpected tool call
    # there is an ordinary failure.
    actual_triggered = tool_count > 0
    status = "pass" if actual_triggered == probe.expect_tool_call else "fail"

    output_preview = (result.output or "")[:200].replace("\n", " ")
    traj_str = _format_trajectory(result.trajectory)

    print(f"Status: {status.upper()} ({elapsed:.1f}s, {result.total_steps} steps)")
    print(f"Tool calls: {tool_count} {tool_names}")
    print(f"Output preview: {output_preview}")
    print(f"Trajectory:\n{traj_str}")

    record.update(
        actual_tool_calls=tool_count,
        tool_names=tool_names,
        status=status,
        elapsed_s=elapsed,
        output_preview=output_preview,
        trajectory=traj_str,
        total_steps=result.total_steps,
        total_tokens=result.total_tokens,
        react_status=result.status,
    )
    return record
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Main
|
||
# ---------------------------------------------------------------------------
|
||
|
||
async def main() -> int:
    """Run every probe, print the report and return the process exit code.

    Steps: load the server config, build the LLM gateway and ReAct engine,
    run all entries of ``PROBES`` sequentially, print a summary table, then
    print the verdict.

    Returns:
        0: PASS — at least 3 tool-call probes passed and every direct-answer
           probe passed.
        1: FAIL — the threshold was missed and probe errors alone cannot
           explain it.
        2: no verdict — setup failed (config, providers, gateway), or the
           threshold was missed only because probes raised errors, so the
           run is inconclusive.
    """
    print("=" * 60)
    print("ReAct L4 Smoke Test (U6)")
    print("Verifies U4 L0 prompt rule rearrangement under real LLM calls")
    print("=" * 60)

    # 1. Load the server config (agentkit.yaml + environment).
    print("\n[1/3] Loading server config...")
    try:
        config_path = find_config_path()
        if not config_path:
            print("FATAL: no agentkit.yaml found (./agentkit.yaml or ~/.agentkit/agentkit.yaml)")
            return 2
        server_config = load_config_with_dotenv(config_path)
    except Exception as e:
        print(f"FATAL: failed to load config: {type(e).__name__}: {e}")
        return 2

    if not server_config.llm_config.providers:
        print("FATAL: no LLM providers configured (check agentkit.yaml / .env)")
        return 2

    providers_with_key = [
        name for name, p in server_config.llm_config.providers.items()
        if p.api_key
    ]
    if not providers_with_key:
        print("FATAL: no LLM providers with api_key set")
        print("Set env vars (e.g. OPENAI_API_KEY) or configure agentkit.yaml")
        return 2

    print(f" Providers with key: {providers_with_key}")
    print(f" Default model alias: {server_config.llm_config.model_aliases.get('default', '<unset>')}")

    # 2. Build the LLM gateway and the ReAct engine.
    print("\n[2/3] Building LLM gateway + ReAct engine...")
    # Imported here rather than at module top, as in the original script.
    from agentkit.cli.chat import _build_gateway

    gateway = _build_gateway(server_config)
    if not gateway.has_providers:
        print("FATAL: gateway has no registered providers")
        return 2
    # ``_providers`` is a private attribute used only for this diagnostic
    # line; fall back to an empty mapping so the print cannot abort the run.
    print(f" Gateway providers: {list(getattr(gateway, '_providers', {}).keys())}")

    engine = ReActEngine(
        llm_gateway=gateway,
        max_steps=8,  # enough steps for several rounds of searching
        default_timeout=120.0,  # cap of two minutes per query
        enable_tool_search=False,  # keep it simple: inject the full web_search description directly
    )

    # 3. Run all probes.
    print(f"\n[3/3] Running {len(PROBES)} probes...")
    results: list[dict] = []
    for probe in PROBES:
        # Reset engine state before each probe so loop detection cannot
        # misfire across queries.
        engine.reset()
        result = await run_probe(engine, probe)
        results.append(result)

    # 4. Summary table.
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"{'#':>3} {'Category':<24} {'Expect':>8} {'Actual':>8} {'Status':>6} {'Time':>6}")
    print("-" * 60)
    for r in results:
        expect_str = "tool" if r["expect_tool_call"] else "direct"
        if r["status"] == "error":
            # An errored probe produced no observable behaviour; do not
            # report it as a direct answer.
            actual_str = "error"
        elif r["actual_tool_calls"] > 0:
            actual_str = f"{r['actual_tool_calls']} tool"
        else:
            actual_str = "direct"
        print(
            f"{r['probe_id']:>3} "
            f"{r['category']:<24} "
            f"{expect_str:>8} "
            f"{actual_str:>8} "
            f"{r['status']:>6} "
            f"{r['elapsed_s']:>5.1f}s"
        )

    # 5. Verdict.
    print("\n" + "-" * 60)
    tool_probes = [r for r in results if r["expect_tool_call"]]
    direct_probes = [r for r in results if not r["expect_tool_call"]]

    tool_pass = sum(1 for r in tool_probes if r["status"] == "pass")
    direct_pass = sum(1 for r in direct_probes if r["status"] == "pass")
    tool_errors = sum(1 for r in tool_probes if r["status"] == "error")
    direct_errors = sum(1 for r in direct_probes if r["status"] == "error")

    print(f"Tool-call probes: {tool_pass}/{len(tool_probes)} passed (threshold: ≥3/4)")
    print(f"Direct-answer probes: {direct_pass}/{len(direct_probes)} passed")
    if tool_errors or direct_errors:
        errored_ids = [r["probe_id"] for r in results if r["status"] == "error"]
        print(f"Errored probes: {len(errored_ids)} (ids: {errored_ids})")

    overall_pass = (tool_pass >= 3) and (direct_pass == len(direct_probes))
    # Errors (network, API key, provider) say nothing about the prompt rules.
    # If the threshold would have been met had the errored probes passed, the
    # run cannot decide between PASS and FAIL.
    could_pass_without_errors = (
        (tool_pass + tool_errors >= 3)
        and (direct_pass + direct_errors == len(direct_probes))
    )

    print("\n" + "=" * 60)
    if overall_pass:
        print("VERDICT: PASS — Bug 2 status upgraded to 'L4 verified'")
        print("Action: update plan Progress table U6 → done")
        return 0
    if (tool_errors or direct_errors) and could_pass_without_errors:
        print("VERDICT: INCONCLUSIVE — probes errored before behaviour could be observed")
        print("Action: fix the errors reported above (network / API key / provider) and re-run")
        return 2
    print("VERDICT: FAIL — L0 rule rearrangement insufficient")
    print("Action: trigger L1 (web_search description expansion) as independent plan")
    print(" also consider L2 (PLAN_EXEC phase policy) if L1 alone insufficient")
    return 1
|
||
|
||
|
||
if __name__ == "__main__":
    # Run the async entry point and hand its return value to the shell as
    # the process exit status.
    exit_code = asyncio.run(main())
    sys.exit(exit_code)
|