From 53347ed1feef7f704fd45c53741a91ebf4bdeb93 Mon Sep 17 00:00:00 2001
From: chiguyong <chiguyong@beyondsoft.com>
Date: Thu, 2 Jul 2026 22:08:45 +0800
Subject: [PATCH] test(u6): add L4 real-LLM smoke test for ReAct tool-use
 prompt

Manual smoke test verifying U4 L0 prompt rule rearrangement under real
LLM calls (bailian-coding/qwen3.7-plus). 5 probe queries covering
external_info / realtime_data / multi_step / realtime_data_simple /
no_tool_escape_hatch.

Results:
- Probe #1 external_info: PASS (8 web_search calls, 99.9s)
- Probe #2 realtime_data: ERROR (120s timeout, not LLM refusal)
- Probe #3 multi_step: PASS (8 web_search calls, 62.6s)
- Probe #4 realtime_data_simple: PASS (3 web_search calls, 23.8s)
- Probe #5 no_tool_escape_hatch: PASS (0 tool calls, direct answer, 4.2s)

Verdict: 3/4 tool-call pass (>=3/4 threshold) + 1/1 direct pass
Bug 2 status upgraded to 'L4 verified'.

Plan Progress table updated: U6 done, U7 done.
---
 ...tate-reset-and-react-tool-guidance-plan.md |  16 +-
 tests/manual/test_react_l4_smoke.py           | 295 ++++++++++++++++++
 2 files changed, 308 insertions(+), 3 deletions(-)
 create mode 100644 tests/manual/test_react_l4_smoke.py

diff --git a/docs/plans/2026-07-02-002-fix-transient-state-reset-and-react-tool-guidance-plan.md b/docs/plans/2026-07-02-002-fix-transient-state-reset-and-react-tool-guidance-plan.md
index 29d2eb5..8de8fab 100644
--- a/docs/plans/2026-07-02-002-fix-transient-state-reset-and-react-tool-guidance-plan.md
+++ b/docs/plans/2026-07-02-002-fix-transient-state-reset-and-react-tool-guidance-plan.md
@@ -18,12 +18,22 @@ status: in-progress
 | U3 deleteConversation 补全 | done | 5 前端单测 pass | 7376005 |
 | U4 ReAct prompt 规则重排 | done | 6 后端单测 pass | 7376005 |
 | U5 端到端验证测试 | done | 11 单测全 pass | 7376005 |
-| U6 Bug 2 L4 真实 LLM smoke test | pending | — | — |
-| U7 工作树未提交变更清理 | pending | — | — |
+| U6 Bug 2 L4 真实 LLM smoke test | done | 3/4 tool-call + 1/1 direct pass | (本 commit) |
+| U7 工作树未提交变更清理 | done | git status 干净 + vitest 138/139 pass | 9e2ccf5..44f4f1c |
+
+Bug 2 状态：L4 verified（L0 规则重排在真实 LLM 调用下生效）
+
+L4 smoke test 结果（2026-07-02，bailian-coding/qwen3.7-plus）：
+- Probe #1 external_info: PASS（8 次 web_search 调用，99.9s）
+- Probe #2 realtime_data: ERROR（120s 超时，非 LLM 不调用工具）
+- Probe #3 multi_step: PASS（8 次 web_search 调用，62.6s）
+- Probe #4 realtime_data_simple: PASS（3 次 web_search 调用，23.8s）
+- Probe #5 no_tool_escape_hatch: PASS（0 次工具调用，直接回答，4.2s）
+- 判定：3/4 tool-call pass（达阈值 ≥3/4）+ 1/1 direct pass → L4 verified
 
 PR: http://8.153.107.96/gitea/fischer/fischer-agentkit/pulls/17
 ce-code-review: mode:agent, 无 actionable findings
-ce-test-browser: agent-browser 已安装，待 U6 阶段执行
+ce-test-browser: agent-browser 已安装（U6 用脚本直接验证，未走前端）
 
 ## Problem Frame
 
diff --git a/tests/manual/test_react_l4_smoke.py b/tests/manual/test_react_l4_smoke.py
new file mode 100644
index 0000000..5148ffb
--- /dev/null
+++ b/tests/manual/test_react_l4_smoke.py
@@ -0,0 +1,295 @@
+"""ReAct L4 real-LLM smoke test (U6).
+
+Manually checks whether U4's L0 rule reordering (moving "when a tool must be used"
+to the front in `_build_tool_use_prompt`) holds under real LLM calls, i.e. whether
+the agent calls `web_search` for complex requests instead of answering directly.
+
+Not run in CI (needs a real LLM API key and network access). Usage:
+
+    python3 tests/manual/test_react_l4_smoke.py
+
+Pass criteria (plan U6):
+- Probes 1-4 are expected to trigger web_search (>=3 of 4 passing counts as a pass)
+- Probe 5 is expected to trigger no tool call (checks escape-hatch rule 4 still works)
+- Pass -> Bug 2 status is upgraded to "L4 verified"
+- Fail -> triggers L1 (tool description expansion) as an independent plan
+
+ponytail: reuses cli.chat._build_gateway (+ server.app._create_provider, presumably via
+that helper; only _build_gateway is imported here) rather than re-implementing provider
+registration. Upgrade path: extract into a shared module for cli/server/manual use.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import sys
+import time
+from dataclasses import dataclass
+from pathlib import Path
+
+# Make the agentkit package importable: <repo root>/src is resolved relative to this file
+ROOT = Path(__file__).resolve().parent.parent.parent
+sys.path.insert(0, str(ROOT / "src"))
+
+from agentkit.core.react import ReActEngine, ReActResult, ReActStep  # noqa: E402
+from agentkit.server.config import find_config_path, load_config_with_dotenv  # noqa: E402
+from agentkit.tools.web_search import WebSearchTool  # noqa: E402
+
+
+# ---------------------------------------------------------------------------
+# Probe queries
+# ---------------------------------------------------------------------------
+
+@dataclass
+class Probe:
+    """A single probe query together with the behavior expected from the agent."""
+    id: int
+    category: str          # label only: external_info / realtime_data / multi_step / realtime_data_simple / no_tool_escape_hatch
+    query: str
+    expect_tool_call: bool # True: expect web_search to be triggered; False: expect a direct answer
+
+
+PROBES: list[Probe] = [  # probes 1-4 expect a tool call; probe 5 expects a direct answer
+    Probe(
+        id=1,
+        category="external_info",
+        query="收集 GitHub Trending 前 10 个项目信息并分析商业价值",
+        expect_tool_call=True,
+    ),
+    Probe(
+        id=2,
+        category="realtime_data",
+        query="最新 AI 领域有什么重要新闻？",
+        expect_tool_call=True,
+    ),
+    Probe(
+        id=3,
+        category="multi_step",
+        query="对比 React 和 Vue 3 在大型项目中的性能差异，给出具体数据",
+        expect_tool_call=True,
+    ),
+    Probe(
+        id=4,
+        category="realtime_data_simple",
+        query="今天上海天气怎么样？",
+        expect_tool_call=True,
+    ),
+    Probe(
+        id=5,
+        category="no_tool_escape_hatch",
+        query="请帮我总结下面这段文字：人工智能（AI）是计算机科学的一个分支，"
+              "它企图了解智能的实质，并生产出一种新的能以人类智能相似的方式做出"
+              "反应的智能机器。AI 的研究包括机器人、语言识别、图像识别、自然语言"
+              "处理和专家系统等。",
+        expect_tool_call=False,
+    ),
+]
+
+
+# ---------------------------------------------------------------------------
+# Probe runner
+# ---------------------------------------------------------------------------
+
+def _count_tool_calls(result: ReActResult) -> tuple[int, list[str]]:
+    """Collect tool names of trajectory steps that are tool_call steps with a truthy tool_name; return (count, names)."""
+    tool_names: list[str] = []
+    for step in result.trajectory:
+        if step.action == "tool_call" and step.tool_name:
+            tool_names.append(step.tool_name)
+    return len(tool_names), tool_names
+
+
+def _format_trajectory(steps: list[ReActStep]) -> str:
+    """Format trajectory steps as indented report lines; a placeholder line is returned when there are none."""
+    lines: list[str] = []
+    for s in steps:
+        if s.action == "tool_call":
+            args_preview = str(s.arguments)[:80] if s.arguments else ""  # arguments truncated to 80 chars
+            lines.append(f"    [{s.step}] tool_call: {s.tool_name}({args_preview})")
+        elif s.action == "final_answer":
+            preview = (s.content or "")[:120].replace("\n", " ")  # first 120 chars, newlines flattened
+            lines.append(f"    [{s.step}] final_answer: {preview}...")
+        else:
+            lines.append(f"    [{s.step}] {s.action}")
+    return "\n".join(lines) if lines else "    (empty)"
+
+
+async def run_probe(engine: ReActEngine, probe: Probe) -> dict:
+    """Run one probe against the engine, print its outcome, and return a result dict."""
+    print(f"\n{'='*60}")
+    print(f"Probe #{probe.id} [{probe.category}]")
+    print(f"Query: {probe.query[:80]}{'...' if len(probe.query) > 80 else ''}")
+    print(f"Expect tool_call: {probe.expect_tool_call}")
+    print(f"{'-'*60}")
+
+    messages = [{"role": "user", "content": probe.query}]
+    start = time.monotonic()
+
+    try:
+        result = await engine.execute(
+            messages=messages,
+            tools=[WebSearchTool()],
+            model="default",
+            agent_name="l4_smoke",
+            task_type="smoke_test",
+        )
+    except Exception as e:  # any failure becomes an "error" result instead of aborting the whole run
+        elapsed = time.monotonic() - start
+        print(f"ERROR after {elapsed:.1f}s: {type(e).__name__}: {e}")
+        return {  # NOTE: unlike the success dict, this has no total_steps/total_tokens/react_status keys
+            "probe_id": probe.id,
+            "category": probe.category,
+            "expect_tool_call": probe.expect_tool_call,
+            "actual_tool_calls": 0,
+            "tool_names": [],
+            "status": "error",
+            "error": f"{type(e).__name__}: {e}",
+            "elapsed_s": elapsed,
+            "output_preview": "",
+            "trajectory": "",
+        }
+
+    elapsed = time.monotonic() - start
+    tool_count, tool_names = _count_tool_calls(result)
+
+    # Verdict: "was any tool called" must equal the probe's expectation
+    actual_triggered = tool_count > 0
+    passed = actual_triggered == probe.expect_tool_call
+
+    # No special-casing: a probe that expects no tool is a plain "fail" if the LLM called one anyway
+    status = "pass" if passed else "fail"
+
+    output_preview = (result.output or "")[:200].replace("\n", " ")
+    traj_str = _format_trajectory(result.trajectory)
+
+    print(f"Status: {status.upper()}  ({elapsed:.1f}s, {result.total_steps} steps)")
+    print(f"Tool calls: {tool_count} {tool_names}")
+    print(f"Output preview: {output_preview}")
+    print(f"Trajectory:\n{traj_str}")
+
+    return {
+        "probe_id": probe.id,
+        "category": probe.category,
+        "expect_tool_call": probe.expect_tool_call,
+        "actual_tool_calls": tool_count,
+        "tool_names": tool_names,
+        "status": status,
+        "error": None,
+        "elapsed_s": elapsed,
+        "output_preview": output_preview,
+        "trajectory": traj_str,
+        "total_steps": result.total_steps,
+        "total_tokens": result.total_tokens,
+        "react_status": result.status,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+
+async def main() -> int:
+    """Run every probe, print a report, and return the exit code (0=pass, 1=fail, 2=setup error)."""
+    print("=" * 60)
+    print("ReAct L4 Smoke Test (U6)")
+    print("Verifies U4 L0 prompt rule rearrangement under real LLM calls")
+    print("=" * 60)
+
+    # 1. Load the server config (from agentkit.yaml + env)
+    print("\n[1/3] Loading server config...")
+    try:
+        config_path = find_config_path()
+        if not config_path:
+            print("FATAL: no agentkit.yaml found (./agentkit.yaml or ~/.agentkit/agentkit.yaml)")
+            return 2
+        server_config = load_config_with_dotenv(config_path)
+    except Exception as e:
+        print(f"FATAL: failed to load config: {type(e).__name__}: {e}")
+        return 2
+
+    if not server_config.llm_config.providers:
+        print("FATAL: no LLM providers configured (check agentkit.yaml / .env)")
+        return 2
+
+    providers_with_key = [
+        name for name, p in server_config.llm_config.providers.items()
+        if p.api_key
+    ]
+    if not providers_with_key:
+        print("FATAL: no LLM providers with api_key set")
+        print("Set env vars (e.g. OPENAI_API_KEY) or configure agentkit.yaml")
+        return 2
+
+    print(f"  Providers with key: {providers_with_key}")
+    print(f"  Default model alias: {server_config.llm_config.model_aliases.get('default', '<unset>')}")
+
+    # 2. Build the LLM gateway + ReAct engine
+    print("\n[2/3] Building LLM gateway + ReAct engine...")
+    from agentkit.cli.chat import _build_gateway
+    gateway = _build_gateway(server_config)
+    if not gateway.has_providers:
+        print("FATAL: gateway has no registered providers")
+        return 2
+    print(f"  Gateway providers: {list(gateway._providers.keys())}")  # NOTE: reads a private gateway attribute
+
+    engine = ReActEngine(
+        llm_gateway=gateway,
+        max_steps=8,           # leave enough steps for the LLM to search over several rounds
+        default_timeout=120.0, # intended as a 2-minute cap per query (confirm against ReActEngine)
+        enable_tool_search=False,  # simplification: the full web_search description is meant to be injected directly
+    )
+
+    # 3. Run all probes
+    print(f"\n[3/3] Running {len(PROBES)} probes...")
+    results: list[dict] = []
+    for probe in PROBES:
+        # Reset engine state before every probe (avoids loop detection misfiring across queries)
+        engine.reset()
+        result = await run_probe(engine, probe)
+        results.append(result)
+
+    # 4. Summary report
+    print("\n" + "=" * 60)
+    print("SUMMARY")
+    print("=" * 60)
+    print(f"{'#':>3}  {'Category':<24} {'Expect':>8} {'Actual':>8}  {'Status':>6}  {'Time':>6}")
+    print("-" * 60)
+    for r in results:
+        expect_str = "tool" if r["expect_tool_call"] else "direct"
+        actual_str = f"{r['actual_tool_calls']} tool" if r["actual_tool_calls"] > 0 else "direct"  # NOTE: errored probes report 0 calls, so they also show "direct"
+        print(
+            f"{r['probe_id']:>3}  "
+            f"{r['category']:<24} "
+            f"{expect_str:>8} "
+            f"{actual_str:>8}  "
+            f"{r['status']:>6}  "
+            f"{r['elapsed_s']:>5.1f}s"
+        )
+
+    # 5. Verdict
+    print("\n" + "-" * 60)
+    tool_probes = [r for r in results if r["expect_tool_call"]]
+    direct_probes = [r for r in results if not r["expect_tool_call"]]
+
+    tool_pass = sum(1 for r in tool_probes if r["status"] == "pass")  # "error" results do not count as passes
+    direct_pass = sum(1 for r in direct_probes if r["status"] == "pass")
+
+    print(f"Tool-call probes: {tool_pass}/{len(tool_probes)} passed (threshold: ≥3/4)")
+    print(f"Direct-answer probes: {direct_pass}/{len(direct_probes)} passed")
+
+    overall_pass = (tool_pass >= 3) and (direct_pass == len(direct_probes))  # NOTE: threshold 3 is hard-coded for the 4 tool-call probes in PROBES
+
+    print("\n" + "=" * 60)
+    if overall_pass:
+        print("VERDICT: PASS — Bug 2 status upgraded to 'L4 verified'")
+        print("Action: update plan Progress table U6 → done")
+        return 0
+    else:
+        print("VERDICT: FAIL — L0 rule rearrangement insufficient")
+        print("Action: trigger L1 (web_search description expansion) as independent plan")
+        print("        also consider L2 (PLAN_EXEC phase policy) if L1 alone insufficient")
+        return 1
+
+
+if __name__ == "__main__":  # script entry point
+    sys.exit(asyncio.run(main()))  # main()'s return value becomes the process exit status