fischer-agentkit/test-results/benchmark/benchmark_report.html

44 lines
3.0 KiB
HTML

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<!-- Lay the page out at device width so the report is readable on small screens; zooming stays enabled. -->
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>AgentKit Benchmark Report</title>
<style>
/* Page-level typography and spacing. */
body {
  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
  margin: 2em;
}
h1 {
  color: #1a1a2e;
}
.meta {
  color: #666;
  margin-bottom: 1em;
}

/* Results table: full-width grid with a dark header row. */
table {
  border-collapse: collapse;
  width: 100%;
  margin: 1em 0;
}
th,
td {
  border: 1px solid #ddd;
  padding: 8px 12px;
  text-align: left;
}
th {
  background-color: #1a1a2e;
  color: white;
}

/* Numeric cells are right-aligned in a monospace face; pass/fail counts are colour-coded. */
td.num {
  text-align: right;
  font-family: monospace;
}
td.pass {
  color: #2e7d32;
}
td.fail {
  color: #c62828;
}

/* Score cells: green / orange / red, always bold. */
.score-good {
  color: #2e7d32;
  font-weight: bold;
}
.score-warn {
  color: #e65100;
  font-weight: bold;
}
.score-bad {
  color: #c62828;
  font-weight: bold;
}

/* Totals row gets a light grey background. */
.overall-row {
  background-color: #f5f5f5;
}

/* One highlighted box per failed case: dimension tag, case name, then detail lines. */
.failure {
  margin: 0.5em 0;
  padding: 0.5em;
  background: #fff3e0;
  border-left: 3px solid #ff9800;
}
.failure .dim {
  color: #e65100;
  font-weight: bold;
}
.failure .case {
  font-family: monospace;
}
.failure .detail {
  font-size: 0.85em;
  color: #555;
  margin-left: 1em;
}

/* No element in this report uses .all-pass; presumably it styles a message shown when nothing failed. */
.all-pass {
  color: #2e7d32;
  font-weight: bold;
}
</style>
</head>
<body>
<h1>AgentKit Benchmark Report</h1>
<!-- Run metadata: timestamp, version, overall score and a one-line summary. -->
<div class="meta">
  <!-- The datetime attribute is truncated to three fractional-second digits (the most a valid
       HTML time string allows); the visible text keeps the full-precision timestamp. -->
  <p>Timestamp: <time datetime="2026-06-17T03:26:25.072+00:00">2026-06-17T03:26:25.072956+00:00</time></p>
  <p>Version: 0.1.0</p>
  <p>Overall Score: <strong>98.0%</strong></p>
  <p>Summary: 50/51 tests passed (1 failed) across 7 dimensions.</p>
</div>
<h2>Dimension Results</h2>
<!-- One row per benchmark dimension, followed by an OVERALL totals row. -->
<table>
  <caption>Tests run, passed and failed, with score, for each benchmark dimension</caption>
  <thead>
    <!-- scope="col" associates every data cell with its column header for assistive technology. -->
    <tr><th scope="col">Dimension</th><th scope="col">Total</th><th scope="col">Pass</th><th scope="col">Fail</th><th scope="col">Score</th></tr>
  </thead>
  <tbody>
    <tr><td>preprocessing</td><td class="num">15</td><td class="num pass">14</td><td class="num fail">1</td><td class="num score-good">93.3%</td></tr>
    <tr><td>overfitting</td><td class="num">3</td><td class="num pass">3</td><td class="num fail">0</td><td class="num score-good">100.0%</td></tr>
    <tr><td>efficiency</td><td class="num">5</td><td class="num pass">5</td><td class="num fail">0</td><td class="num score-good">100.0%</td></tr>
    <tr><td>tool_search</td><td class="num">10</td><td class="num pass">10</td><td class="num fail">0</td><td class="num score-good">100.0%</td></tr>
    <tr><td>event_model</td><td class="num">6</td><td class="num pass">6</td><td class="num fail">0</td><td class="num score-good">100.0%</td></tr>
    <tr><td>spec_management</td><td class="num">7</td><td class="num pass">7</td><td class="num fail">0</td><td class="num score-good">100.0%</td></tr>
    <tr><td>verification</td><td class="num">5</td><td class="num pass">5</td><td class="num fail">0</td><td class="num score-good">100.0%</td></tr>
    <tr class="overall-row"><td><strong>OVERALL</strong></td><td class="num"><strong>51</strong></td><td class="num pass"><strong>50</strong></td><td class="num fail"><strong>1</strong></td><td class="num score-good"><strong>98.0%</strong></td></tr>
  </tbody>
</table>
<h2>Failed Cases</h2>
<!-- One .failure entry per failed case: [dimension] tag, case name, then expected/actual detail lines. -->
<div class="failure">
  <span class="dim">[preprocessing]</span> <span class="case">skill_prefix_direct</span>
  <div class="detail">expected: skill_react</div>
  <div class="detail">actual: direct_chat</div>
</div>
</body>
</html>