test(u6): add L4 real-LLM smoke test for ReAct tool-use prompt

Manual smoke test verifying U4 L0 prompt rule rearrangement under real
LLM calls (bailian-coding/qwen3.7-plus). 5 probe queries covering
external_info / realtime_data / multi_step / realtime_simple / no_tool.

Results:
- Probe #1 external_info: PASS (8 web_search calls, 99.9s)
- Probe #2 realtime_data: ERROR (120s timeout, not LLM refusal)
- Probe #3 multi_step: PASS (8 web_search calls, 62.6s)
- Probe #4 realtime_data_simple: PASS (3 web_search calls, 23.8s)
- Probe #5 no_tool_escape_hatch: PASS (0 tool calls, direct answer, 4.2s)

Verdict: 3/4 tool-call pass (>=3/4 threshold) + 1/1 direct pass
Bug 2 status upgraded to 'L4 verified'.

Plan Progress table updated: U6 done, U7 done.
This commit is contained in:
chiguyong 2026-07-02 22:08:45 +08:00
parent 44f4f1c46f
commit 53347ed1fe
2 changed files with 308 additions and 3 deletions

View File

@ -18,12 +18,22 @@ status: in-progress
| U3 deleteConversation 补全 | done | 5 前端单测 pass | 7376005 | | U3 deleteConversation 补全 | done | 5 前端单测 pass | 7376005 |
| U4 ReAct prompt 规则重排 | done | 6 后端单测 pass | 7376005 | | U4 ReAct prompt 规则重排 | done | 6 后端单测 pass | 7376005 |
| U5 端到端验证测试 | done | 11 单测全 pass | 7376005 | | U5 端到端验证测试 | done | 11 单测全 pass | 7376005 |
| U6 Bug 2 L4 真实 LLM smoke test | pending | — | — | | U6 Bug 2 L4 真实 LLM smoke test | done | 3/4 tool-call + 1/1 direct pass | (本 commit) |
| U7 工作树未提交变更清理 | pending | — | — | | U7 工作树未提交变更清理 | done | git status 干净 + vitest 138/139 pass | 9e2ccf5..44f4f1c |
Bug 2 状态：L4 verified（L0 规则重排在真实 LLM 调用下生效）
L4 smoke test 结果（2026-07-02，bailian-coding/qwen3.7-plus）：
- Probe #1 external_info: PASS（8 次 web_search 调用，99.9s）
- Probe #2 realtime_data: ERROR（120s 超时，非 LLM 不调用工具）
- Probe #3 multi_step: PASS（8 次 web_search 调用，62.6s）
- Probe #4 realtime_data_simple: PASS（3 次 web_search 调用，23.8s）
- Probe #5 no_tool_escape_hatch: PASS（0 次工具调用，直接回答，4.2s）
- 判定：3/4 tool-call pass（达阈值 ≥3/4）+ 1/1 direct pass → L4 verified
PR: http://8.153.107.96/gitea/fischer/fischer-agentkit/pulls/17 PR: http://8.153.107.96/gitea/fischer/fischer-agentkit/pulls/17
ce-code-review: mode:agent, 无 actionable findings ce-code-review: mode:agent, 无 actionable findings
ce-test-browser: agent-browser 已安装,待 U6 阶段执行 ce-test-browser: agent-browser 已安装U6 用脚本直接验证,未走前端)
## Problem Frame ## Problem Frame

View File

@ -0,0 +1,295 @@
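"""ReAct L4 real-LLM smoke test (U6).

Manually verifies whether the U4 L0 rule rearrangement (moving the
"when tools must be used" / "何时必须使用工具" section of
`_build_tool_use_prompt` to the front) takes effect under real LLM calls,
i.e. whether the agent calls `web_search` for complex requests instead of
answering directly.

Not part of CI (depends on a real LLM API key and network access). Run with:

    python3 tests/manual/test_react_l4_smoke.py

Pass criteria (plan U6):
- Probes 1-4 are expected to trigger web_search; at least 3/4 passing counts
  as a pass.
- Probe 5 is expected to trigger no tool call, verifying that escape-hatch
  rule 4 is still effective.
- On pass: Bug 2 status is upgraded to "L4 verified".
- On fail: trigger L1 (tool description expansion) as an independent plan.

ponytail: directly reuses cli.chat._build_gateway + server.app._create_provider
instead of re-implementing the provider registration logic. Upgrade path:
extract it into a shared module used by cli/server/manual.
"""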
from __future__ import annotations
import asyncio
import sys
import time
from dataclasses import dataclass
from pathlib import Path
# 确保可以 import agentkit 包(脚本从仓库根目录运行)
ROOT = Path(__file__).resolve().parent.parent.parent
sys.path.insert(0, str(ROOT / "src"))
from agentkit.core.react import ReActEngine, ReActResult, ReActStep # noqa: E402
from agentkit.server.config import find_config_path, load_config_with_dotenv # noqa: E402
from agentkit.tools.web_search import WebSearchTool # noqa: E402
# ---------------------------------------------------------------------------
# Probe queries
# ---------------------------------------------------------------------------
@dataclass
class Probe:
    """One probe query plus the tool-use behaviour expected from the agent."""

    # Numeric identifier printed in the per-probe banner and the summary table.
    id: int
    # Scenario label, e.g. external_info / realtime_data / multi_step /
    # realtime_data_simple / no_tool_escape_hatch.
    category: str
    # Text sent to the engine as the single user message.
    query: str
    # True: the agent is expected to call a tool (web_search).
    # False: the agent is expected to answer directly without any tool call.
    expect_tool_call: bool
# Probes are run in list order. The first four expect at least one tool call;
# the last one expects a direct answer (the text to summarise is already in
# the query, so no external lookup should be needed).
PROBES: list[Probe] = [
    # Needs information that lives outside the model.
    Probe(
        id=1,
        category="external_info",
        query="收集 GitHub Trending 前 10 个项目信息并分析商业价值",
        expect_tool_call=True,
    ),
    # Needs up-to-date data.
    Probe(
        id=2,
        category="realtime_data",
        query="最新 AI 领域有什么重要新闻？",
        expect_tool_call=True,
    ),
    # Needs several lookups plus a comparison.
    Probe(
        id=3,
        category="multi_step",
        query="对比 React 和 Vue 3 在大型项目中的性能差异，给出具体数据",
        expect_tool_call=True,
    ),
    # Simple question that still depends on live data.
    Probe(
        id=4,
        category="realtime_data_simple",
        query="今天上海天气怎么样？",
        expect_tool_call=True,
    ),
    # Self-contained summarisation task: no tool call expected.
    Probe(
        id=5,
        category="no_tool_escape_hatch",
        query=(
            "请帮我总结下面这段文字人工智能AI是计算机科学的一个分支"
            "它企图了解智能的实质，并生产出一种新的能以人类智能相似的方式做出"
            "反应的智能机器。AI 的研究包括机器人、语言识别、图像识别、自然语言"
            "处理和专家系统等。"
        ),
        expect_tool_call=False,
    ),
]
# ---------------------------------------------------------------------------
# Probe runner
# ---------------------------------------------------------------------------
def _count_tool_calls(result: ReActResult) -> tuple[int, list[str]]:
    """Collect the tools invoked during a run.

    A trajectory step is counted only when its ``action`` equals
    ``"tool_call"`` and it carries a truthy ``tool_name``.

    Returns:
        ``(count, names)`` where ``names`` lists the tool names in trajectory
        order and ``count`` is ``len(names)``.
    """
    invoked = [
        entry.tool_name
        for entry in result.trajectory
        if entry.action == "tool_call" and entry.tool_name
    ]
    return len(invoked), invoked
def _format_trajectory(steps: list[ReActStep]) -> str:
    """Render trajectory steps as one report line per step.

    * ``tool_call`` steps show the tool name and the first 80 characters of
      ``str(arguments)`` (nothing when the arguments are falsy).
    * ``final_answer`` steps show the first 120 characters of the content with
      newlines replaced by spaces, always followed by ``...``.
    * Any other action is shown by name only.

    Returns:
        The lines joined with newlines, or a placeholder line when ``steps``
        is empty.
    """

    def render(item) -> str:
        if item.action == "tool_call":
            shown_args = "" if not item.arguments else str(item.arguments)[:80]
            return f" [{item.step}] tool_call: {item.tool_name}({shown_args})"
        if item.action == "final_answer":
            text = item.content or ""
            snippet = text[:120].replace("\n", " ")
            return f" [{item.step}] final_answer: {snippet}..."
        return f" [{item.step}] {item.action}"

    rendered = [render(item) for item in steps]
    if not rendered:
        return " (empty)"
    return "\n".join(rendered)
async def run_probe(engine: ReActEngine, probe: Probe) -> dict:
    """Run one probe through the engine and judge its tool-use behaviour.

    The probe query is sent as the only user message, with a single
    ``WebSearchTool`` instance available. A banner, the outcome and the
    trajectory are printed to stdout.

    Args:
        engine: Object whose ``execute`` coroutine runs the query.
        probe: Query plus the expected behaviour.

    Returns:
        A result dict with the same keys on every path:
        ``probe_id``, ``category``, ``expect_tool_call``,
        ``actual_tool_calls``, ``tool_names``, ``status`` (``"pass"``,
        ``"fail"`` or ``"error"``), ``error``, ``elapsed_s``,
        ``output_preview``, ``trajectory``, ``total_steps``,
        ``total_tokens`` and ``react_status``. When the run raised, ``status``
        is ``"error"``, ``error`` holds ``"<ExceptionType>: <message>"`` and
        the three engine-derived fields are ``None``.
    """
    print(f"\n{'='*60}")
    print(f"Probe #{probe.id} [{probe.category}]")
    print(f"Query: {probe.query[:80]}{'...' if len(probe.query) > 80 else ''}")
    print(f"Expect tool_call: {probe.expect_tool_call}")
    print(f"{'-'*60}")

    # Single template for both outcomes so consumers never hit a missing key.
    outcome: dict = {
        "probe_id": probe.id,
        "category": probe.category,
        "expect_tool_call": probe.expect_tool_call,
        "actual_tool_calls": 0,
        "tool_names": [],
        "status": "error",
        "error": None,
        "elapsed_s": 0.0,
        "output_preview": "",
        "trajectory": "",
        "total_steps": None,
        "total_tokens": None,
        "react_status": None,
    }

    messages = [{"role": "user", "content": probe.query}]
    start = time.monotonic()
    try:
        result = await engine.execute(
            messages=messages,
            tools=[WebSearchTool()],
            model="default",
            agent_name="l4_smoke",
            task_type="smoke_test",
        )
    except Exception as e:
        # Deliberately broad: one failing probe (timeout, network, provider
        # error) must be reported without aborting the remaining probes.
        elapsed = time.monotonic() - start
        print(f"ERROR after {elapsed:.1f}s: {type(e).__name__}: {e}")
        outcome["error"] = f"{type(e).__name__}: {e}"
        outcome["elapsed_s"] = elapsed
        return outcome

    elapsed = time.monotonic() - start
    tool_count, tool_names = _count_tool_calls(result)

    # A probe passes when "at least one tool was called" matches the
    # expectation; this holds for tool probes and direct-answer probes alike.
    actual_triggered = tool_count > 0
    status = "pass" if actual_triggered == probe.expect_tool_call else "fail"

    output_preview = (result.output or "")[:200].replace("\n", " ")
    traj_str = _format_trajectory(result.trajectory)

    print(f"Status: {status.upper()} ({elapsed:.1f}s, {result.total_steps} steps)")
    print(f"Tool calls: {tool_count} {tool_names}")
    print(f"Output preview: {output_preview}")
    print(f"Trajectory:\n{traj_str}")

    outcome.update(
        {
            "actual_tool_calls": tool_count,
            "tool_names": tool_names,
            "status": status,
            "elapsed_s": elapsed,
            "output_preview": output_preview,
            "trajectory": traj_str,
            "total_steps": result.total_steps,
            "total_tokens": result.total_tokens,
            "react_status": result.status,
        }
    )
    return outcome
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
async def main() -> int:
    """Run every probe, print a report and return a process exit code.

    Steps: load the server config, build the LLM gateway and a ReAct engine,
    run each entry of ``PROBES`` in order, print a summary table, then print
    the verdict.

    Returns:
        0 if the verdict is PASS (enough tool-call probes passed and every
        direct-answer probe passed), 1 if the verdict is FAIL, 2 if setup
        failed before any probe ran (no config file, config load error, no
        provider configured, no provider with an API key, or a gateway
        without registered providers).
    """
    # Minimum number of passing tool-call probes. Kept in one place so the
    # check and the printed threshold cannot drift apart.
    required_tool_passes = 3

    print("=" * 60)
    print("ReAct L4 Smoke Test (U6)")
    print("Verifies U4 L0 prompt rule rearrangement under real LLM calls")
    print("=" * 60)

    # 1. Load the server config (from agentkit.yaml + env).
    print("\n[1/3] Loading server config...")
    try:
        config_path = find_config_path()
        if not config_path:
            print("FATAL: no agentkit.yaml found (./agentkit.yaml or ~/.agentkit/agentkit.yaml)")
            return 2
        server_config = load_config_with_dotenv(config_path)
    except Exception as e:
        # Any config problem is fatal for a manual run; report it and stop.
        print(f"FATAL: failed to load config: {type(e).__name__}: {e}")
        return 2

    if not server_config.llm_config.providers:
        print("FATAL: no LLM providers configured (check agentkit.yaml / .env)")
        return 2

    providers_with_key = [
        name for name, p in server_config.llm_config.providers.items()
        if p.api_key
    ]
    if not providers_with_key:
        print("FATAL: no LLM providers with api_key set")
        print("Set env vars (e.g. OPENAI_API_KEY) or configure agentkit.yaml")
        return 2

    print(f" Providers with key: {providers_with_key}")
    print(f" Default model alias: {server_config.llm_config.model_aliases.get('default', '<unset>')}")

    # 2. Build the LLM gateway and the ReAct engine.
    print("\n[2/3] Building LLM gateway + ReAct engine...")
    from agentkit.cli.chat import _build_gateway

    gateway = _build_gateway(server_config)
    if not gateway.has_providers:
        print("FATAL: gateway has no registered providers")
        return 2
    # NOTE(review): reads the gateway's private ``_providers`` attribute for
    # display only; switch to a public accessor if the gateway offers one.
    print(f" Gateway providers: {list(gateway._providers.keys())}")

    engine = ReActEngine(
        llm_gateway=gateway,
        max_steps=8,  # leave enough steps for multi-round searching
        default_timeout=120.0,  # per-query cap of 2 minutes
        enable_tool_search=False,  # simplification: inject the full web_search description directly
    )

    # 3. Run all probes sequentially.
    print(f"\n[3/3] Running {len(PROBES)} probes...")
    results: list[dict] = []
    for probe in PROBES:
        # Reset engine state before each probe so loop detection does not
        # misfire across queries.
        engine.reset()
        result = await run_probe(engine, probe)
        results.append(result)

    # 4. Summary table.
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"{'#':>3} {'Category':<24} {'Expect':>8} {'Actual':>8} {'Status':>6} {'Time':>6}")
    print("-" * 60)
    for r in results:
        expect_str = "tool" if r["expect_tool_call"] else "direct"
        actual_str = f"{r['actual_tool_calls']} tool" if r["actual_tool_calls"] > 0 else "direct"
        print(
            f"{r['probe_id']:>3} "
            f"{r['category']:<24} "
            f"{expect_str:>8} "
            f"{actual_str:>8} "
            f"{r['status']:>6} "
            f"{r['elapsed_s']:>5.1f}s"
        )

    # 5. Verdict.
    print("\n" + "-" * 60)
    tool_probes = [r for r in results if r["expect_tool_call"]]
    direct_probes = [r for r in results if not r["expect_tool_call"]]
    tool_pass = sum(1 for r in tool_probes if r["status"] == "pass")
    direct_pass = sum(1 for r in direct_probes if r["status"] == "pass")
    errored_ids = [r["probe_id"] for r in results if r["status"] == "error"]

    print(
        f"Tool-call probes: {tool_pass}/{len(tool_probes)} passed "
        f"(threshold: ≥{required_tool_passes}/{len(tool_probes)})"
    )
    print(f"Direct-answer probes: {direct_pass}/{len(direct_probes)} passed")
    if errored_ids:
        # Errors (timeouts, network, provider failures) count as non-passing
        # but say nothing about whether the LLM chose to call a tool.
        print(f"Errored probes: {errored_ids} (counted as not passed; not evidence of LLM behaviour)")

    overall_pass = (tool_pass >= required_tool_passes) and (direct_pass == len(direct_probes))

    print("\n" + "=" * 60)
    if overall_pass:
        print("VERDICT: PASS — Bug 2 status upgraded to 'L4 verified'")
        print("Action: update plan Progress table U6 → done")
        return 0

    print("VERDICT: FAIL — L0 rule rearrangement insufficient")
    print("Action: trigger L1 (web_search description expansion) as independent plan")
    print(" also consider L2 (PLAN_EXEC phase policy) if L1 alone insufficient")
    return 1
if __name__ == "__main__":
    # Use main()'s integer return value as the process exit status.
    exit_code = asyncio.run(main())
    sys.exit(exit_code)