"""ReAct L4 real-LLM smoke test (U6).

Manually verifies whether U4's L0 rule reordering (moving the "when tools must
be used" section to the front of `_build_tool_use_prompt`) takes effect under
real LLM calls, i.e. whether the agent calls `web_search` for complex requests
instead of answering directly.

Not part of CI (requires a real LLM API key and network access). Run with:

    python3 tests/manual/test_react_l4_smoke.py

Pass criteria (plan U6):
- Probes 1-4 are expected to trigger web_search (≥3/4 passing counts as a pass)
- Probe 5 is expected NOT to trigger a tool call (verifies that escape-hatch
  rule 4 still works)
- Pass -> Bug 2 status is upgraded to "L4 verified"
- Fail -> triggers L1 (tool description expansion) as an independent plan

ponytail: reuses cli.chat._build_gateway + server.app._create_provider directly
instead of re-implementing the provider registration logic. Upgrade path:
extract it into a shared module used by cli/server/manual.
"""

from __future__ import annotations

import asyncio
import sys
import time
from dataclasses import dataclass
from pathlib import Path

# Make the agentkit package importable (the script is run from the repo root).
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT / "src"))

from agentkit.core.react import ReActEngine, ReActResult, ReActStep  # noqa: E402
from agentkit.server.config import find_config_path, load_config_with_dotenv  # noqa: E402
from agentkit.tools.web_search import WebSearchTool  # noqa: E402


# ---------------------------------------------------------------------------
# Probe queries
# ---------------------------------------------------------------------------


@dataclass
class Probe:
    """A single probe query together with its expected behaviour."""

    id: int
    category: str  # external_info / realtime_data / multi_step / uncertain / no_tool
    query: str
    expect_tool_call: bool  # True: expect web_search to fire; False: expect a direct answer


PROBES: list[Probe] = [
    Probe(
        id=1,
        category="external_info",
        query="收集 GitHub Trending 前 10 个项目信息并分析商业价值",
        expect_tool_call=True,
    ),
    Probe(
        id=2,
        category="realtime_data",
        query="最新 AI 领域有什么重要新闻?",
        expect_tool_call=True,
    ),
    Probe(
        id=3,
        category="multi_step",
        query="对比 React 和 Vue 3 在大型项目中的性能差异,给出具体数据",
        expect_tool_call=True,
    ),
    Probe(
        id=4,
        category="realtime_data_simple",
        query="今天上海天气怎么样?",
        expect_tool_call=True,
    ),
    Probe(
        id=5,
        category="no_tool_escape_hatch",
        query="请帮我总结下面这段文字:人工智能(AI)是计算机科学的一个分支,"
        "它企图了解智能的实质,并生产出一种新的能以人类智能相似的方式做出"
        "反应的智能机器。AI 的研究包括机器人、语言识别、图像识别、自然语言"
        "处理和专家系统等。",
        expect_tool_call=False,
    ),
]

# Minimum number of tool-expecting probes that must pass for an overall PASS.
# Kept in one place so the comparison and the printed threshold cannot drift.
TOOL_PASS_THRESHOLD = 3

# Maximum number of characters of a final answer shown in a trajectory line.
_ANSWER_PREVIEW_CHARS = 120


# ---------------------------------------------------------------------------
# Probe runner
# ---------------------------------------------------------------------------


def _count_tool_calls(result: ReActResult) -> tuple[int, list[str]]:
    """Collect the tool_call steps of a trajectory.

    Only steps whose ``action`` is ``"tool_call"`` and whose ``tool_name`` is
    truthy are counted.

    Returns:
        A ``(count, tool_names)`` tuple, with names in trajectory order.
    """
    tool_names = [
        step.tool_name
        for step in result.trajectory
        if step.action == "tool_call" and step.tool_name
    ]
    return len(tool_names), tool_names


def _format_trajectory(steps: list[ReActStep]) -> str:
    """Render a trajectory as one indented line per step for the report.

    Tool calls show the tool name and up to 80 characters of the arguments;
    final answers show a single-line preview, followed by an ellipsis only
    when the answer was actually truncated. Returns ``" (empty)"`` when there
    are no steps.
    """
    lines: list[str] = []
    for s in steps:
        if s.action == "tool_call":
            args_preview = str(s.arguments)[:80] if s.arguments else ""
            lines.append(f" [{s.step}] tool_call: {s.tool_name}({args_preview})")
        elif s.action == "final_answer":
            content = s.content or ""
            preview = content[:_ANSWER_PREVIEW_CHARS].replace("\n", " ")
            # Mark truncation only when something was really cut off, matching
            # how run_probe previews the query.
            suffix = "..." if len(content) > _ANSWER_PREVIEW_CHARS else ""
            lines.append(f" [{s.step}] final_answer: {preview}{suffix}")
        else:
            lines.append(f" [{s.step}] {s.action}")
    return "\n".join(lines) if lines else " (empty)"


async def run_probe(engine: ReActEngine, probe: Probe) -> dict:
    """Run one probe through the engine and return a result record.

    The probe passes when "at least one tool call happened" equals
    ``probe.expect_tool_call``. Any exception raised by ``engine.execute`` is
    caught and reported as ``status == "error"`` so the remaining probes still
    run.

    Returns:
        A dict with the same keys on both the success and the error path:
        probe_id, category, expect_tool_call, actual_tool_calls, tool_names,
        status ("pass" / "fail" / "error"), error, elapsed_s, output_preview,
        trajectory, total_steps, total_tokens, react_status. The last three
        are ``None`` when the engine call failed.
    """
    print(f"\n{'='*60}")
    print(f"Probe #{probe.id} [{probe.category}]")
    print(f"Query: {probe.query[:80]}{'...' if len(probe.query) > 80 else ''}")
    print(f"Expect tool_call: {probe.expect_tool_call}")
    print(f"{'-'*60}")

    messages = [{"role": "user", "content": probe.query}]
    start = time.monotonic()
    try:
        result = await engine.execute(
            messages=messages,
            tools=[WebSearchTool()],
            model="default",
            agent_name="l4_smoke",
            task_type="smoke_test",
        )
    except Exception as e:  # top-level boundary of a manual smoke test: report and continue
        elapsed = time.monotonic() - start
        print(f"ERROR after {elapsed:.1f}s: {type(e).__name__}: {e}")
        return {
            "probe_id": probe.id,
            "category": probe.category,
            "expect_tool_call": probe.expect_tool_call,
            "actual_tool_calls": 0,
            "tool_names": [],
            "status": "error",
            "error": f"{type(e).__name__}: {e}",
            "elapsed_s": elapsed,
            "output_preview": "",
            "trajectory": "",
            # No engine result exists; keep the keys so every record has the
            # same shape as a successful one.
            "total_steps": None,
            "total_tokens": None,
            "react_status": None,
        }

    elapsed = time.monotonic() - start
    tool_count, tool_names = _count_tool_calls(result)

    # Verdict: "a tool was actually triggered" must equal "a tool was expected".
    # No probe is special-cased: if probe 5 calls a tool it is simply a fail
    # (the behaviour is still visible in the printed trajectory).
    actual_triggered = tool_count > 0
    passed = actual_triggered == probe.expect_tool_call
    status = "pass" if passed else "fail"

    output_preview = (result.output or "")[:200].replace("\n", " ")
    traj_str = _format_trajectory(result.trajectory)

    print(f"Status: {status.upper()} ({elapsed:.1f}s, {result.total_steps} steps)")
    print(f"Tool calls: {tool_count} {tool_names}")
    print(f"Output preview: {output_preview}")
    print(f"Trajectory:\n{traj_str}")

    return {
        "probe_id": probe.id,
        "category": probe.category,
        "expect_tool_call": probe.expect_tool_call,
        "actual_tool_calls": tool_count,
        "tool_names": tool_names,
        "status": status,
        "error": None,
        "elapsed_s": elapsed,
        "output_preview": output_preview,
        "trajectory": traj_str,
        "total_steps": result.total_steps,
        "total_tokens": result.total_tokens,
        "react_status": result.status,
    }


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------


async def main() -> int:
    """Run every probe, print a report and return the process exit code.

    Returns:
        0 when the overall verdict is PASS, 1 when it is FAIL, and 2 when the
        run could not start (no config file, config load failure, no provider
        with an API key, or a gateway without registered providers).
    """
    print("=" * 60)
    print("ReAct L4 Smoke Test (U6)")
    print("Verifies U4 L0 prompt rule rearrangement under real LLM calls")
    print("=" * 60)

    # 1. Load the server config (from agentkit.yaml + env).
    print("\n[1/3] Loading server config...")
    try:
        config_path = find_config_path()
        if not config_path:
            print("FATAL: no agentkit.yaml found (./agentkit.yaml or ~/.agentkit/agentkit.yaml)")
            return 2
        server_config = load_config_with_dotenv(config_path)
    except Exception as e:
        print(f"FATAL: failed to load config: {type(e).__name__}: {e}")
        return 2

    if not server_config.llm_config.providers:
        print("FATAL: no LLM providers configured (check agentkit.yaml / .env)")
        return 2

    providers_with_key = [
        name for name, p in server_config.llm_config.providers.items() if p.api_key
    ]
    if not providers_with_key:
        print("FATAL: no LLM providers with api_key set")
        print("Set env vars (e.g. OPENAI_API_KEY) or configure agentkit.yaml")
        return 2

    print(f" Providers with key: {providers_with_key}")
    print(f" Default model alias: {server_config.llm_config.model_aliases.get('default', '')}")

    # 2. Build the LLM gateway + ReAct engine.
    print("\n[2/3] Building LLM gateway + ReAct engine...")
    # Imported here rather than at module top, as in the original script.
    from agentkit.cli.chat import _build_gateway

    gateway = _build_gateway(server_config)
    if not gateway.has_providers:
        print("FATAL: gateway has no registered providers")
        return 2
    # NOTE(review): reads the gateway's private `_providers` mapping for
    # display only; switch to a public accessor if one exists.
    print(f" Gateway providers: {list(gateway._providers.keys())}")

    engine = ReActEngine(
        llm_gateway=gateway,
        max_steps=8,  # leave enough steps for multi-round searching
        default_timeout=120.0,  # cap each query at 2 minutes
        enable_tool_search=False,  # simplification: inject the full web_search description
    )

    # 3. Run all probes sequentially.
    print(f"\n[3/3] Running {len(PROBES)} probes...")
    results: list[dict] = []
    for probe in PROBES:
        # Reset engine state before each probe so loop detection does not
        # misfire across queries.
        engine.reset()
        result = await run_probe(engine, probe)
        results.append(result)

    # 4. Summary report.
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"{'#':>3} {'Category':<24} {'Expect':>8} {'Actual':>8} {'Status':>6} {'Time':>6}")
    print("-" * 60)
    for r in results:
        expect_str = "tool" if r["expect_tool_call"] else "direct"
        if r["status"] == "error":
            # The engine call failed, so there was neither a tool call nor a
            # direct answer to report.
            actual_str = "n/a"
        elif r["actual_tool_calls"] > 0:
            actual_str = f"{r['actual_tool_calls']} tool"
        else:
            actual_str = "direct"
        print(
            f"{r['probe_id']:>3} "
            f"{r['category']:<24} "
            f"{expect_str:>8} "
            f"{actual_str:>8} "
            f"{r['status']:>6} "
            f"{r['elapsed_s']:>5.1f}s"
        )

    # 5. Verdict.
    print("\n" + "-" * 60)
    tool_probes = [r for r in results if r["expect_tool_call"]]
    direct_probes = [r for r in results if not r["expect_tool_call"]]
    tool_pass = sum(1 for r in tool_probes if r["status"] == "pass")
    direct_pass = sum(1 for r in direct_probes if r["status"] == "pass")

    print(
        f"Tool-call probes: {tool_pass}/{len(tool_probes)} passed "
        f"(threshold: ≥{TOOL_PASS_THRESHOLD}/{len(tool_probes)})"
    )
    print(f"Direct-answer probes: {direct_pass}/{len(direct_probes)} passed")

    # Errored probes have status "error" and therefore never count as passed.
    overall_pass = (tool_pass >= TOOL_PASS_THRESHOLD) and (direct_pass == len(direct_probes))

    print("\n" + "=" * 60)
    if overall_pass:
        print("VERDICT: PASS — Bug 2 status upgraded to 'L4 verified'")
        print("Action: update plan Progress table U6 → done")
        return 0

    print("VERDICT: FAIL — L0 rule rearrangement insufficient")
    print("Action: trigger L1 (web_search description expansion) as independent plan")
    print(" also consider L2 (PLAN_EXEC phase policy) if L1 alone insufficient")
    return 1


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))