<!DOCTYPE html>
<!--
  AgentKit benchmark report: a static, self-contained page (inline CSS, no
  scripts) listing per-dimension test counts, an overall score, and the
  details of each failed case.
  Nothing may precede the doctype above: any stray text before it switches
  browsers into quirks mode.
-->
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>AgentKit Benchmark Report</title>
  <style>
    /* Page chrome */
    body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; margin: 2em; }
    h1 { color: #1a1a2e; }
    .meta { color: #666; margin-bottom: 1em; }

    /* Results table */
    table { border-collapse: collapse; width: 100%; margin: 1em 0; }
    th, td { border: 1px solid #ddd; padding: 8px 12px; text-align: left; }
    /* Dark styling is limited to column headers so that row headers
       (the dimension names) keep the look of ordinary body cells. */
    thead th { background-color: #1a1a2e; color: white; }
    tbody th { font-weight: normal; }
    td.num { text-align: right; font-family: monospace; }
    td.pass { color: #2e7d32; }
    td.fail { color: #c62828; }

    /* Score colouring. Only score-good is referenced by the markup below;
       score-warn and score-bad are defined but unused in this report. */
    .score-good { color: #2e7d32; font-weight: bold; }
    .score-warn { color: #e65100; font-weight: bold; }
    .score-bad { color: #c62828; font-weight: bold; }
    .overall-row { background-color: #f5f5f5; }

    /* Failed-case cards */
    .failure { margin: 0.5em 0; padding: 0.5em; background: #fff3e0; border-left: 3px solid #ff9800; }
    .failure .dim { color: #e65100; font-weight: bold; }
    .failure .case { font-family: monospace; }
    .failure .detail { font-size: 0.85em; color: #555; margin-left: 1em; }

    /* Defined but not referenced by the markup below. */
    .all-pass { color: #2e7d32; font-weight: bold; }
  </style>
</head>
<body>
  <main>
    <h1>AgentKit Benchmark Report</h1>

    <!-- Run metadata and headline numbers -->
    <div class="meta">
      <p>Timestamp: 2026-06-17T03:26:25.072956+00:00</p>
      <p>Version: 0.1.0</p>
      <p>Overall Score: <strong>98.0%</strong></p>
      <p>Summary: 50/51 tests passed (1 failed) across 7 dimensions.</p>
    </div>

    <!-- One row per dimension; the final row sums the seven rows above it.
         The table takes its accessible name from the heading. -->
    <h2 id="dimension-results-heading">Dimension Results</h2>
    <table aria-labelledby="dimension-results-heading">
      <thead>
        <tr>
          <th scope="col">Dimension</th>
          <th scope="col">Total</th>
          <th scope="col">Pass</th>
          <th scope="col">Fail</th>
          <th scope="col">Score</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <th scope="row">preprocessing</th>
          <td class="num">15</td>
          <td class="num pass">14</td>
          <td class="num fail">1</td>
          <td class="num score-good">93.3%</td>
        </tr>
        <tr>
          <th scope="row">overfitting</th>
          <td class="num">3</td>
          <td class="num pass">3</td>
          <td class="num fail">0</td>
          <td class="num score-good">100.0%</td>
        </tr>
        <tr>
          <th scope="row">efficiency</th>
          <td class="num">5</td>
          <td class="num pass">5</td>
          <td class="num fail">0</td>
          <td class="num score-good">100.0%</td>
        </tr>
        <tr>
          <th scope="row">tool_search</th>
          <td class="num">10</td>
          <td class="num pass">10</td>
          <td class="num fail">0</td>
          <td class="num score-good">100.0%</td>
        </tr>
        <tr>
          <th scope="row">event_model</th>
          <td class="num">6</td>
          <td class="num pass">6</td>
          <td class="num fail">0</td>
          <td class="num score-good">100.0%</td>
        </tr>
        <tr>
          <th scope="row">spec_management</th>
          <td class="num">7</td>
          <td class="num pass">7</td>
          <td class="num fail">0</td>
          <td class="num score-good">100.0%</td>
        </tr>
        <tr>
          <th scope="row">verification</th>
          <td class="num">5</td>
          <td class="num pass">5</td>
          <td class="num fail">0</td>
          <td class="num score-good">100.0%</td>
        </tr>
        <tr class="overall-row">
          <th scope="row"><strong>OVERALL</strong></th>
          <td class="num"><strong>51</strong></td>
          <td class="num pass"><strong>50</strong></td>
          <td class="num fail"><strong>1</strong></td>
          <td class="num score-good"><strong>98.0%</strong></td>
        </tr>
      </tbody>
    </table>

    <!-- One card per failed case: [dimension] case name, then the
         expected and actual values. -->
    <h2>Failed Cases</h2>
    <div class="failure">
      <span class="dim">[preprocessing]</span>
      <span class="case">skill_prefix_direct</span>
      <div class="detail">expected: skill_react</div>
      <div class="detail">actual: direct_chat</div>
    </div>
  </main>
</body>
</html>