From 53347ed1feef7f704fd45c53741a91ebf4bdeb93 Mon Sep 17 00:00:00 2001 From: chiguyong Date: Thu, 2 Jul 2026 22:08:45 +0800 Subject: [PATCH] test(u6): add L4 real-LLM smoke test for ReAct tool-use prompt Manual smoke test verifying U4 L0 prompt rule rearrangement under real LLM calls (bailian-coding/qwen3.7-plus). 5 probe queries covering external_info / realtime_data / multi_step / realtime_simple / no_tool. Results: - Probe #1 external_info: PASS (8 web_search calls, 99.9s) - Probe #2 realtime_data: ERROR (120s timeout, not LLM refusal) - Probe #3 multi_step: PASS (8 web_search calls, 62.6s) - Probe #4 realtime_data_simple: PASS (3 web_search calls, 23.8s) - Probe #5 no_tool_escape_hatch: PASS (0 tool calls, direct answer, 4.2s) Verdict: 3/4 tool-call pass (>=3/4 threshold) + 1/1 direct pass Bug 2 status upgraded to 'L4 verified'. Plan Progress table updated: U6 done, U7 done. --- ...tate-reset-and-react-tool-guidance-plan.md | 16 +- tests/manual/test_react_l4_smoke.py | 295 ++++++++++++++++++ 2 files changed, 308 insertions(+), 3 deletions(-) create mode 100644 tests/manual/test_react_l4_smoke.py diff --git a/docs/plans/2026-07-02-002-fix-transient-state-reset-and-react-tool-guidance-plan.md b/docs/plans/2026-07-02-002-fix-transient-state-reset-and-react-tool-guidance-plan.md index 29d2eb5..8de8fab 100644 --- a/docs/plans/2026-07-02-002-fix-transient-state-reset-and-react-tool-guidance-plan.md +++ b/docs/plans/2026-07-02-002-fix-transient-state-reset-and-react-tool-guidance-plan.md @@ -18,12 +18,22 @@ status: in-progress | U3 deleteConversation 补全 | done | 5 前端单测 pass | 7376005 | | U4 ReAct prompt 规则重排 | done | 6 后端单测 pass | 7376005 | | U5 端到端验证测试 | done | 11 单测全 pass | 7376005 | -| U6 Bug 2 L4 真实 LLM smoke test | pending | — | — | -| U7 工作树未提交变更清理 | pending | — | — | +| U6 Bug 2 L4 真实 LLM smoke test | done | 3/4 tool-call + 1/1 direct pass | (本 commit) | +| U7 工作树未提交变更清理 | done | git status 干净 + vitest 138/139 pass | 9e2ccf5..44f4f1c | + +Bug 2 状态:L4 
verified(L0 规则重排在真实 LLM 调用下生效) + +L4 smoke test 结果(2026-07-02,bailian-coding/qwen3.7-plus): +- Probe #1 external_info: PASS(8 次 web_search 调用,99.9s) +- Probe #2 realtime_data: ERROR(120s 超时,非 LLM 不调用工具) +- Probe #3 multi_step: PASS(8 次 web_search 调用,62.6s) +- Probe #4 realtime_data_simple: PASS(3 次 web_search 调用,23.8s) +- Probe #5 no_tool_escape_hatch: PASS(0 次工具调用,直接回答,4.2s) +- 判定:3/4 tool-call pass(达阈值 ≥3/4)+ 1/1 direct pass → L4 verified PR: http://8.153.107.96/gitea/fischer/fischer-agentkit/pulls/17 ce-code-review: mode:agent, 无 actionable findings -ce-test-browser: agent-browser 已安装,待 U6 阶段执行 +ce-test-browser: agent-browser 已安装(U6 用脚本直接验证,未走前端) ## Problem Frame diff --git a/tests/manual/test_react_l4_smoke.py b/tests/manual/test_react_l4_smoke.py new file mode 100644 index 0000000..5148ffb --- /dev/null +++ b/tests/manual/test_react_l4_smoke.py @@ -0,0 +1,295 @@ +"""ReAct L4 真实 LLM smoke test (U6). + +手动验证 U4 的 L0 规则重排(`_build_tool_use_prompt` 中"何时必须使用工具"前置) +在真实 LLM 调用下是否生效 — Agent 面对复杂需求时是否调用 `web_search` 而非 +直接回答。 + +不进 CI(依赖真实 LLM API key + 网络)。运行方式: + + python3 tests/manual/test_react_l4_smoke.py + +判定标准(plan U6): +- Probe 1-4 期望触发 web_search(≥3/4 pass 算通过) +- Probe 5 期望不触发工具调用(验证 escape hatch 规则 4 仍有效) +- 通过 → Bug 2 状态升级为 "L4 verified" +- 未通过 → 触发 L1(工具描述扩展)独立 plan + +ponytail: 直接复用 cli.chat._build_gateway + server.app._create_provider, +不重新实现 provider 注册逻辑。升级路径:抽到 shared 模块供 cli/server/manual +共用。 +""" + +from __future__ import annotations + +import asyncio +import sys +import time +from dataclasses import dataclass +from pathlib import Path + +# 确保可以 import agentkit 包(脚本从仓库根目录运行) +ROOT = Path(__file__).resolve().parent.parent.parent +sys.path.insert(0, str(ROOT / "src")) + +from agentkit.core.react import ReActEngine, ReActResult, ReActStep # noqa: E402 +from agentkit.server.config import find_config_path, load_config_with_dotenv # noqa: E402 +from agentkit.tools.web_search import WebSearchTool # noqa: E402 + + +# 
# ---------------------------------------------------------------------------
# Probe queries
# ---------------------------------------------------------------------------

@dataclass
class Probe:
    """A single probe query together with the behavior expected from the agent."""

    id: int
    # One of the categories used in PROBES: external_info / realtime_data /
    # multi_step / realtime_data_simple / no_tool_escape_hatch.
    category: str
    query: str
    # True: the agent is expected to call a tool (web_search);
    # False: the agent is expected to answer directly.
    expect_tool_call: bool


PROBES: list[Probe] = [
    Probe(
        id=1,
        category="external_info",
        query="收集 GitHub Trending 前 10 个项目信息并分析商业价值",
        expect_tool_call=True,
    ),
    Probe(
        id=2,
        category="realtime_data",
        query="最新 AI 领域有什么重要新闻?",
        expect_tool_call=True,
    ),
    Probe(
        id=3,
        category="multi_step",
        query="对比 React 和 Vue 3 在大型项目中的性能差异,给出具体数据",
        expect_tool_call=True,
    ),
    Probe(
        id=4,
        category="realtime_data_simple",
        query="今天上海天气怎么样?",
        expect_tool_call=True,
    ),
    Probe(
        id=5,
        category="no_tool_escape_hatch",
        query="请帮我总结下面这段文字:人工智能(AI)是计算机科学的一个分支,"
        "它企图了解智能的实质,并生产出一种新的能以人类智能相似的方式做出"
        "反应的智能机器。AI 的研究包括机器人、语言识别、图像识别、自然语言"
        "处理和专家系统等。",
        expect_tool_call=False,
    ),
]

# Minimum number of tool-expecting probes that must pass for an overall PASS.
_TOOL_PASS_THRESHOLD = 3

# Preview lengths used when printing trajectories.
_ARGS_PREVIEW_CHARS = 80
_ANSWER_PREVIEW_CHARS = 120


# ---------------------------------------------------------------------------
# Probe runner
# ---------------------------------------------------------------------------

def _truncate(text: str, limit: int) -> str:
    """Return *text* cut to *limit* characters, marked with "..." only if cut."""
    return text if len(text) <= limit else f"{text[:limit]}..."


def _count_tool_calls(result: ReActResult) -> tuple[int, list[str]]:
    """Collect the tool-call steps of a trajectory.

    Returns:
        ``(count, tool_names)`` where ``tool_names`` lists, in order, the
        ``tool_name`` of every step whose action is ``"tool_call"`` and whose
        ``tool_name`` is truthy.
    """
    tool_names = [
        step.tool_name
        for step in result.trajectory
        if step.action == "tool_call" and step.tool_name
    ]
    return len(tool_names), tool_names


def _format_trajectory(steps: list[ReActStep]) -> str:
    """Render a trajectory as one line per step for the printed report.

    Tool arguments and final-answer text are shortened to a preview; the
    "..." marker is appended only when text was actually cut off.
    """
    lines: list[str] = []
    for s in steps:
        if s.action == "tool_call":
            args_preview = (
                _truncate(str(s.arguments), _ARGS_PREVIEW_CHARS) if s.arguments else ""
            )
            lines.append(f" [{s.step}] tool_call: {s.tool_name}({args_preview})")
        elif s.action == "final_answer":
            flattened = (s.content or "").replace("\n", " ")
            preview = _truncate(flattened, _ANSWER_PREVIEW_CHARS)
            lines.append(f" [{s.step}] final_answer: {preview}")
        else:
            lines.append(f" [{s.step}] {s.action}")
    return "\n".join(lines) if lines else " (empty)"


async def run_probe(engine: ReActEngine, probe: Probe) -> dict:
    """Run one probe through the engine and return a result dictionary.

    The returned dict always carries the same keys, whether the run
    succeeded or raised: ``probe_id``, ``category``, ``expect_tool_call``,
    ``actual_tool_calls``, ``tool_names``, ``status`` ("pass" / "fail" /
    "error"), ``error``, ``elapsed_s``, ``output_preview``, ``trajectory``,
    ``total_steps``, ``total_tokens`` and ``react_status``. The last three
    are ``None`` when the engine raised, because no result object exists.
    """
    print(f"\n{'='*60}")
    print(f"Probe #{probe.id} [{probe.category}]")
    print(f"Query: {probe.query[:80]}{'...' if len(probe.query) > 80 else ''}")
    print(f"Expect tool_call: {probe.expect_tool_call}")
    print(f"{'-'*60}")

    # Defaults describe the "engine raised" outcome; the success path
    # overwrites them below.
    report: dict = {
        "probe_id": probe.id,
        "category": probe.category,
        "expect_tool_call": probe.expect_tool_call,
        "actual_tool_calls": 0,
        "tool_names": [],
        "status": "error",
        "error": None,
        "elapsed_s": 0.0,
        "output_preview": "",
        "trajectory": "",
        "total_steps": None,
        "total_tokens": None,
        "react_status": None,
    }

    messages = [{"role": "user", "content": probe.query}]
    start = time.monotonic()

    try:
        result = await engine.execute(
            messages=messages,
            tools=[WebSearchTool()],
            model="default",
            agent_name="l4_smoke",
            task_type="smoke_test",
        )
    except Exception as e:
        # Smoke-test boundary: any failure (timeout, provider error, ...) is
        # reported as an "error" result so the remaining probes still run.
        elapsed = time.monotonic() - start
        print(f"ERROR after {elapsed:.1f}s: {type(e).__name__}: {e}")
        report["error"] = f"{type(e).__name__}: {e}"
        report["elapsed_s"] = elapsed
        return report

    elapsed = time.monotonic() - start
    tool_count, tool_names = _count_tool_calls(result)

    # Verdict: "was any tool triggered" must equal "was a tool expected".
    status = "pass" if (tool_count > 0) == probe.expect_tool_call else "fail"

    output_preview = (result.output or "")[:200].replace("\n", " ")
    traj_str = _format_trajectory(result.trajectory)

    print(f"Status: {status.upper()} ({elapsed:.1f}s, {result.total_steps} steps)")
    print(f"Tool calls: {tool_count} {tool_names}")
    print(f"Output preview: {output_preview}")
    print(f"Trajectory:\n{traj_str}")

    report.update(
        {
            "actual_tool_calls": tool_count,
            "tool_names": tool_names,
            "status": status,
            "elapsed_s": elapsed,
            "output_preview": output_preview,
            "trajectory": traj_str,
            "total_steps": result.total_steps,
            "total_tokens": result.total_tokens,
            "react_status": result.status,
        }
    )
    return report


# ---------------------------------------------------------------------------
# Reporting helpers
# ---------------------------------------------------------------------------

def _summary_row(r: dict) -> str:
    """Format one probe result as a row of the summary table.

    An errored probe is shown as "error" in the Actual column rather than
    "direct", so a crash or timeout is not mistaken for a direct answer.
    """
    expect_str = "tool" if r["expect_tool_call"] else "direct"
    if r["status"] == "error":
        actual_str = "error"
    elif r["actual_tool_calls"] > 0:
        actual_str = f"{r['actual_tool_calls']} tool"
    else:
        actual_str = "direct"
    return (
        f"{r['probe_id']:>3} "
        f"{r['category']:<24} "
        f"{expect_str:>8} "
        f"{actual_str:>8} "
        f"{r['status']:>6} "
        f"{r['elapsed_s']:>5.1f}s"
    )


def _tally(results: list[dict]) -> tuple[int, int, int, int]:
    """Return ``(tool_pass, tool_total, direct_pass, direct_total)`` for *results*."""
    tool_probes = [r for r in results if r["expect_tool_call"]]
    direct_probes = [r for r in results if not r["expect_tool_call"]]
    tool_pass = sum(1 for r in tool_probes if r["status"] == "pass")
    direct_pass = sum(1 for r in direct_probes if r["status"] == "pass")
    return tool_pass, len(tool_probes), direct_pass, len(direct_probes)


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

async def main() -> int:
    """Run every probe, print a report and return the process exit code.

    Returns:
        0 when the verdict is PASS, 1 when it is FAIL, and 2 when the run
        could not start (config not found or not loadable, no provider with
        an API key, or the gateway has no registered providers).
    """
    print("=" * 60)
    print("ReAct L4 Smoke Test (U6)")
    print("Verifies U4 L0 prompt rule rearrangement under real LLM calls")
    print("=" * 60)

    # 1. Load the server config (agentkit.yaml + env).
    print("\n[1/3] Loading server config...")
    try:
        config_path = find_config_path()
        if not config_path:
            print("FATAL: no agentkit.yaml found (./agentkit.yaml or ~/.agentkit/agentkit.yaml)")
            return 2
        server_config = load_config_with_dotenv(config_path)
    except Exception as e:
        print(f"FATAL: failed to load config: {type(e).__name__}: {e}")
        return 2

    if not server_config.llm_config.providers:
        print("FATAL: no LLM providers configured (check agentkit.yaml / .env)")
        return 2

    providers_with_key = [
        name for name, p in server_config.llm_config.providers.items()
        if p.api_key
    ]
    if not providers_with_key:
        print("FATAL: no LLM providers with api_key set")
        print("Set env vars (e.g. OPENAI_API_KEY) or configure agentkit.yaml")
        return 2

    print(f" Providers with key: {providers_with_key}")
    print(f" Default model alias: {server_config.llm_config.model_aliases.get('default', '')}")

    # 2. Build the LLM gateway and the ReAct engine.
    print("\n[2/3] Building LLM gateway + ReAct engine...")
    from agentkit.cli.chat import _build_gateway
    gateway = _build_gateway(server_config)
    if not gateway.has_providers:
        print("FATAL: gateway has no registered providers")
        return 2
    # NOTE(review): reads the gateway's private `_providers` mapping for
    # display only; switch to a public accessor if the gateway offers one.
    print(f" Gateway providers: {list(gateway._providers.keys())}")

    engine = ReActEngine(
        llm_gateway=gateway,
        max_steps=8,  # leave enough steps for multi-round searching
        default_timeout=120.0,  # per-query cap of 2 minutes
        enable_tool_search=False,  # simplification: web_search is passed in directly
    )

    # 3. Run all probes sequentially.
    print(f"\n[3/3] Running {len(PROBES)} probes...")
    results: list[dict] = []
    for probe in PROBES:
        # Reset engine state before each probe so loop detection does not
        # carry over between unrelated queries.
        engine.reset()
        results.append(await run_probe(engine, probe))

    # 4. Summary table.
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"{'#':>3} {'Category':<24} {'Expect':>8} {'Actual':>8} {'Status':>6} {'Time':>6}")
    print("-" * 60)
    for r in results:
        print(_summary_row(r))

    # 5. Verdict.
    print("\n" + "-" * 60)
    tool_pass, tool_total, direct_pass, direct_total = _tally(results)

    print(
        f"Tool-call probes: {tool_pass}/{tool_total} passed "
        f"(threshold: ≥{_TOOL_PASS_THRESHOLD}/{tool_total})"
    )
    print(f"Direct-answer probes: {direct_pass}/{direct_total} passed")

    overall_pass = tool_pass >= _TOOL_PASS_THRESHOLD and direct_pass == direct_total

    print("\n" + "=" * 60)
    if overall_pass:
        print("VERDICT: PASS — Bug 2 status upgraded to 'L4 verified'")
        print("Action: update plan Progress table U6 → done")
        return 0

    print("VERDICT: FAIL — L0 rule rearrangement insufficient")
    print("Action: trigger L1 (web_search description expansion) as independent plan")
    print(" also consider L2 (PLAN_EXEC phase policy) if L1 alone insufficient")
    return 1


if __name__ == "__main__":
    sys.exit(asyncio.run(main()))